// -----// IR Dump Before AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
module {
  func.func public @main(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<4x6x5x5xf32>) -> tensor<2x4x7x9xf32> {
    %c2 = arith.constant 2 : index
    %c11 = arith.constant 11 : index
    %c13 = arith.constant 13 : index
    %c6 = arith.constant 6 : index
    %cst = arith.constant 0.000000e+00 : f32
    %0 = flow.dispatch.workgroups(%arg0, %arg1, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
        (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
      %1 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
      %2 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %3 = tensor.empty() : tensor<2x4x7x9xf32>
      %cst_0 = arith.constant 0.000000e+00 : f32
      %cast = tensor.cast %1 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %4 = linalg.fill ins(%cst_0 : f32) outs(%3 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %5 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %2 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%4 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %5, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      flow.return
    }
    return %0 : tensor<2x4x7x9xf32>
  }
}
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump Before IREEImportPublicPass (iree-import-public) //----- //
// (module unchanged; identical to the previous dump)
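// A sketch for orientation, not part of the dump. A trace of this shape is what the standard
// MLIR instrumentation flags (-mlir-print-ir-before-all / -mlir-print-ir-after-all) produce,
// and the module enters the pipeline with the convolution already wrapped in a
// flow.dispatch.workgroups region. The standalone function below restates just the dispatched
// computation; the function and value names are invented and do not appear in the trace.
func.func @conv_body(%input: tensor<?x?x?x?xf32>, %filter: tensor<4x6x5x5xf32>) -> tensor<2x4x7x9xf32> {
  %zero = arith.constant 0.000000e+00 : f32
  // The dynamically shaped input is asserted to its static shape before the convolution.
  %cast = tensor.cast %input : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
  %empty = tensor.empty() : tensor<2x4x7x9xf32>
  %acc = linalg.fill ins(%zero : f32) outs(%empty : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
  // NCHW input 2x6x11x13 convolved with FCHW filter 4x6x5x5 at unit stride and unit
  // dilation: output H = 11 - 5 + 1 = 7, W = 13 - 5 + 1 = 9, giving 2x4x7x9.
  %conv = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>}
      ins(%cast, %filter : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>)
      outs(%acc : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
  return %conv : tensor<2x4x7x9xf32>
}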
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- //
module {
  util.func public @main(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<4x6x5x5xf32>) -> tensor<2x4x7x9xf32> {
    %c2 = arith.constant 2 : index
    %c11 = arith.constant 11 : index
    %c13 = arith.constant 13 : index
    %c6 = arith.constant 6 : index
    %cst = arith.constant 0.000000e+00 : f32
    %0 = flow.dispatch.workgroups(%arg0, %arg1, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
        (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
      %1 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
      %2 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %3 = tensor.empty() : tensor<2x4x7x9xf32>
      %cst_0 = arith.constant 0.000000e+00 : f32
      %cast = tensor.cast %1 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %4 = linalg.fill ins(%cst_0 : f32) outs(%3 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %5 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %2 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%4 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %5, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      flow.return
    }
    util.return %0 : tensor<2x4x7x9xf32>
  }
}
// -----// IR Dump Before ImportMLProgramPass (iree-import-ml-program) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump Before SanitizeModuleNamesPass (iree-sanitize-module-names) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump Before ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- //
module {
  util.func public @main(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<4x6x5x5xf32>) -> tensor<2x4x7x9xf32> {
    %c2 = arith.constant 2 : index
    %c11 = arith.constant 11 : index
    %c13 = arith.constant 13 : index
    %c6 = arith.constant 6 : index
    %0 = flow.dispatch.workgroups(%arg0, %arg1, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
        (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
      %cst = arith.constant 0.000000e+00 : f32
      %1 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
      %2 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %3 = tensor.empty() : tensor<2x4x7x9xf32>
      %cast = tensor.cast %1 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %5 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %2 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%4 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %5, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      flow.return
    }
    util.return %0 : tensor<2x4x7x9xf32>
  }
}
// -----// IR Dump Before DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump Before mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump Before mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
module {
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = util.call @_main(%4, %5) : (tensor<?x?x?x?xf32>, tensor<4x6x5x5xf32>) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
  util.func private @_main(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<4x6x5x5xf32>) -> tensor<2x4x7x9xf32> {
    %c2 = arith.constant 2 : index
    %c11 = arith.constant 11 : index
    %c13 = arith.constant 13 : index
    %c6 = arith.constant 6 : index
    %0 = flow.dispatch.workgroups(%arg0, %arg1, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
        (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
      %cst = arith.constant 0.000000e+00 : f32
      %1 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
      %2 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %3 = tensor.empty() : tensor<2x4x7x9xf32>
      %cast = tensor.cast %1 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %5 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %2 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%4 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %5, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      flow.return
    }
    util.return %0 : tensor<2x4x7x9xf32>
  }
}
// -----// IR Dump Before Inliner (inline) //----- //
// (module unchanged; identical to the previous dump)
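// A sketch for orientation, not part of the dump: the marshaling pattern that
// iree-abi-wrap-entry-points emitted above, reduced to a single 1-D operand. A buffer view
// carries its shape at runtime, so the wrapper queries the dynamic dims, imports the operand
// as a tensor, calls the original entry point, and exports the result. All names here
// (@wrapped, @inner, %view0) are invented for the sketch.
module {
  util.func public @wrapped(%view0: !hal.buffer_view) -> !hal.buffer_view {
    %d0 = hal.buffer_view.dim<%view0 : !hal.buffer_view>[0] : index
    %t0 = hal.tensor.import %view0 "input0" : !hal.buffer_view -> tensor<?xf32>{%d0}
    %r = util.call @inner(%t0) : (tensor<?xf32>) -> tensor<4xf32>
    %out = hal.tensor.export %r "output0" : tensor<4xf32> -> !hal.buffer_view
    util.return %out : !hal.buffer_view
  }
  // Stand-in for the wrapped private function; asserts the dynamic shape and returns.
  util.func private @inner(%t: tensor<?xf32>) -> tensor<4xf32> {
    %cast = tensor.cast %t : tensor<?xf32> to tensor<4xf32>
    util.return %cast : tensor<4xf32>
  }
}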
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (function-scoped dump of @_main; identical to @_main in the "Before Inliner" module above)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (function unchanged; identical to the previous dump)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (function-scoped dump of the public @main wrapper; identical to @main in the "Before Inliner" module above)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (function unchanged; identical to the previous dump)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %c2 = arith.constant 2 : index
  %c11 = arith.constant 11 : index
  %c13 = arith.constant 13 : index
  %c6 = arith.constant 6 : index
  %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
    %cst = arith.constant 0.000000e+00 : f32
    %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
    %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
    %10 = tensor.empty() : tensor<2x4x7x9xf32>
    %cast = tensor.cast %8 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
    %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    flow.return
  }
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
    %cst = arith.constant 0.000000e+00 : f32
    %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
    %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
    %10 = tensor.empty() : tensor<2x4x7x9xf32>
    %cast = tensor.cast %8 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
    %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    flow.return
  }
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
// -----// IR Dump After Inliner (inline) //----- //
module {
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
        (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
      %cst = arith.constant 0.000000e+00 : f32
      %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
      %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %10 = tensor.empty() : tensor<2x4x7x9xf32>
      %cast = tensor.cast %8 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      flow.return
    }
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (function-scoped dump of @main; identical to @main in the previous dump)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (function unchanged; identical to the previous dump)
// -----// IR Dump Before CSE (cse) //----- //
// (function unchanged; identical to the previous dump)
// -----// IR Dump After CSE (cse) //----- //
// (function unchanged; identical to the previous dump)
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- //
// (module unchanged; identical to the "After Inliner" dump above)
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
// (module unchanged; SymbolDCE found nothing to remove, since the inliner already erased @_main)
// -----// IR Dump Before AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
module attributes {hal.device.targets = [#hal.device.alias<"llvm-cpu"> : !hal.device]} {
  // (@main is unchanged; the pass only added the hal.device.targets module attribute above)
}
// -----// IR Dump Before MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
// (module unchanged; identical to the previous dump)
// -----// IR Dump Before MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
// (module identical to the previous dump)
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #hal.device.alias<"llvm-cpu"> : !hal.device
  // (util.func @main unchanged)
}
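// NOTE: MaterializeTargetDevicesPass turns the hal.device.targets attribute into
// util.global device handles and points stream.affinity.default at the first one.
// A hypothetical two-device module would materialize, e.g.:
//   util.global private @__device_0 = #hal.device.alias<"llvm-cpu"> : !hal.device
//   util.global private @__device_1 = #hal.device.alias<"vulkan"> : !hal.device
// (the @__device_1 line is illustrative only; this module has a single device).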
// -----// IR Dump Before ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
// (module identical to the previous dump)
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
// (module unchanged; nothing to resolve)
// -----// IR Dump Before ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
// (module identical to the previous dump)
hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} 
// -----// IR Dump Before VerifyDevicesPass (iree-hal-verify-devices) //----- //
// (module identical to the previous dump)
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
// (module unchanged; the pass only checks that every referenced device target is registered)
// -----// IR Dump Before LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
// (util.func @main as above; this and the following global-opt passes run
// per-function and print at function scope, so the module attributes and globals
// are not repeated)
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
// (function unchanged; the linalg.conv_2d_nchw_fchw here is float, not quantized)
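// NOTE: LinalgQuantizedConvToConvPass rewrites quantized convolutions (e.g. the
// upstream linalg.conv_2d_nhwc_hwcf_q op, which carries input/filter zero points)
// into a plain conv plus correction terms. A rough sketch, not from this module:
//   %q = linalg.conv_2d_nhwc_hwcf_q ins(%in, %f, %izp, %fzp : ...) outs(%acc : ...)
//   ==> %c = linalg.conv_2d_nhwc_hwcf ins(%in, %f : ...) outs(%acc : ...)
//       ... minus %fzp * windowed_sum(%in) and %izp * sum(%f),
//       plus window_size * %izp * %fzp
// Nothing matches here since the conv is f32.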
// -----// IR Dump Before LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
// (function identical to the previous dump)
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
// (function unchanged; no linalg.quantized_matmul in the module)
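// NOTE: LinalgQuantizedMatmulToMatmulPass does the same for linalg.quantized_matmul:
// with A: MxK, B: KxN and zero points az/bz,
//   sum_k (A[i,k] - az)(B[k,j] - bz)
//     = matmul(A,B)[i,j] - az * sum_k B[k,j] - bz * sum_k A[i,k] + K * az * bz
// so the pass emits linalg.matmul plus row/column-sum corrections. No quantized
// matmuls exist in this module, so the IR is unchanged.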
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
// (function identical to the previous dump)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (function unchanged)
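// NOTE: iree-flow-canonicalize found nothing to fold: the constants are already
// deduplicated and the tensor.cast legitimately refines tensor<?x?x?x?xf32> to the
// static tensor<2x6x11x13xf32>, so it cannot be erased.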
// -----// IR Dump Before RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
// (function identical to the previous dump)
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
// (function unchanged; no zero-extent tensor values to replace)
// -----// IR Dump Before DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
// (function identical to the previous dump)
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
// (function unchanged; the conv already accumulates into a zero fill)
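// NOTE: DetachElementwiseFromNamedOpsPass rewrites named ops that accumulate into
// a non-constant outs operand so the init becomes a zero fill and the original
// accumulator is re-added elementwise afterwards. A rough sketch (hypothetical
// %bias; not from this module):
//   %r = linalg.conv_2d_nchw_fchw ... outs(%bias : tensor<2x4x7x9xf32>)
//   ==> %fill = linalg.fill ins(%zero : f32) outs(%empty : tensor<2x4x7x9xf32>)
//       %c    = linalg.conv_2d_nchw_fchw ... outs(%fill : tensor<2x4x7x9xf32>)
//       %r    = elementwise add of %c and %bias (as a linalg op)
// The conv in this module already accumulates into a linalg.fill of 0.0, so it is
// left alone.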
// -----// IR Dump Before LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
// (function identical to the previous dump)
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
// (function unchanged; this upstream pass mainly simplifies depthwise-conv
// variants, none of which appear here)
// -----// IR Dump Before Convert1X1FilterConv2DToMatmulPass (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- //
// (function identical to the previous dump)
// -----// IR Dump After Convert1X1FilterConv2DToMatmulPass (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- //
// (function unchanged; the filter is 5x5, not 1x1)
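// NOTE: Convert1X1FilterConv2DToMatmulPass only fires for 1x1 filters. For a
// hypothetical tensor<4x6x1x1xf32> filter the conv is a matmul over the flattened
// spatial dims, roughly:
//   %in2d = tensor.collapse_shape %input [[0], [1], [2, 3]]
//           : tensor<2x6x11x13xf32> into tensor<2x6x143xf32>
//   %f2d  = tensor.collapse_shape %filter [[0], [1, 2, 3]]
//           : tensor<4x6x1x1xf32> into tensor<4x6xf32>
//   ... a batch-style matmul contracting the shared 6-channel dim
//       -> tensor<2x4x143xf32>, then tensor.expand_shape back to
//       tensor<2x4x11x13xf32>.
// (names and exact op sequence illustrative; with a 1x1 filter the output spatial
// size equals the input's 11x13)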
#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = 
{iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = 
flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> 
tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : 
!hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> 
tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 
= hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> 
tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = 
flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> 
tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view 
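// note: every Before/After pair from EraseUnusedLinalgOperandsPass through
// FoldUnitExtentDimsPass prints byte-identical IR. A plausible reason, not
// stated anywhere in this log: the payload already lives inside a pre-formed
// flow.dispatch.workgroups region, and these global-optimization passes
// generally skip ops that are already inside a dispatch region.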
// -----// IR Dump Before DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (func @main unchanged from the preceding dump)
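// note: iree-flow-canonicalize and cse run below in back-to-back pairs, twice
// in a row. This looks like the routine cleanup IREE interleaves between
// phases of the pipeline; with the IR already at a fixed point, both leave it
// untouched.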
// -----// IR Dump Before CSE (cse) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump After CSE (cse) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump Before CSE (cse) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump After CSE (cse) //----- //
// (func @main unchanged from the preceding dump)
// -----// IR Dump Before SetEncodingPass (iree-dispatch-creation-set-encoding) //----- //
// (func @main unchanged from the preceding dump)
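// note: SetEncodingPass is where matmul-like ops would normally acquire
// data-tiling encodings. It makes no change here, presumably because the only
// candidate, linalg.conv_2d_nchw_fchw, sits inside an existing dispatch
// region, and convolutions are not data-tiled on this generic llvm-cpu target
// in any case.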
0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After SetEncodingPass (iree-dispatch-creation-set-encoding) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before MaterializeHomogeneousEncodingsPass (iree-global-opt-materialize-homogeneous-encodings) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 
11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before CPUMaterializeHostEncodingPass (iree-codegen-cpu-materialize-host-encoding) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: 
// -----// IR Dump After CPUMaterializeHostEncodingPass (iree-codegen-cpu-materialize-host-encoding) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump After MaterializeHomogeneousEncodingsPass (iree-global-opt-materialize-homogeneous-encodings) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump Before CSE (cse) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump After CSE (cse) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump Before SimplifyPackUnpackPass (iree-global-opt-simplify-pack-unpack) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump After SimplifyPackUnpackPass (iree-global-opt-simplify-pack-unpack) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump Before DataLayoutPropagationPass (iree-global-opt-data-layout-propagation) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump After DataLayoutPropagationPass (iree-global-opt-data-layout-propagation) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump Before GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump Before GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump Before CSE (cse) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump After CSE (cse) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (function unchanged; identical to the full function dump above)
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
// (module unchanged; identical to the full module dump above)
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (module unchanged; identical to the full module dump above)
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 
: index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: 
index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, 
strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index 
%c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before HoistIntoGlobals (iree-util-hoist-into-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, 
%arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After HoistIntoGlobals (iree-util-hoist-into-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = 
dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before JitGlobalsPass (iree-consteval-jit-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, 
%c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : 
tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = 
flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = 
{iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets 
= [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> 
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: 
// -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
        (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
      %cst = arith.constant 0.000000e+00 : f32
      %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg4, %arg5, %arg6, %arg7} -> tensor<?x?x?x?xf32>
      %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %10 = tensor.empty() : tensor<2x4x7x9xf32>
      %cast = tensor.cast %8 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      flow.return
    }
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
// (no change: @main identical to the previous dump)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (no change: @main identical to the previous dump)
// -----// IR Dump Before CSE (cse) //----- //
// (no change: @main identical to the previous dump)
// -----// IR Dump After CSE (cse) //----- //
// (no change: @main identical to the previous dump)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (no change: @main identical to the previous dump)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (no change: @main identical to the previous dump)
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
// (module as in the "Before FixedPointIterator" dump above, with iree.fixedpoint.iteration = 0 : index added to the module attributes)
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
// (no change: module identical to the previous dump)
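// Note: inside the dispatch region, the dynamically shaped load is refined to a static type with
// tensor.cast before the convolution, so the codegen backend sees fully static shapes. A minimal
// standalone sketch of that pattern (hypothetical function, not part of this dump):
func.func @refine_to_static(%t: tensor<?x?x?x?xf32>) -> tensor<2x6x11x13xf32> {
  // tensor.cast only refines the type: it asserts, rather than checks, that %t is exactly
  // 2x6x11x13 at runtime, and a mismatching cast is undefined behavior.
  %0 = tensor.cast %t : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
  return %0 : tensor<2x6x11x13xf32>
}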
#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes 
{iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view 
-> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : 
!flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : 
tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global 
private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = 
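// Note: FixedPointIterator re-runs its sub-pipeline (ApplyPatterns, FoldGlobals, FuseGlobals,
// IPO, ...) until the IR stops changing; the iree.fixedpoint.iteration = 0 : index module
// attribute records the current iteration and disappears once the loop converges, which is why
// it is present only between the Before/After FixedPointIterator dumps. A log like this one is
// typically produced with MLIR's standard IR-printing options; the exact flags below are an
// assumption and may vary with the iree-compile version:
//   iree-compile --iree-hal-target-backends=llvm-cpu \
//       --mlir-print-ir-before-all --mlir-print-ir-after-all model.mlir -o model.vmfb 2> dump.mlir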
hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) 
-> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, 
%c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : 
tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = 
flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes 
{iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = 
flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 
2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> 
tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, 
tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : 
(tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return 
%7 : !hal.buffer_view } // -----// IR Dump Before ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: 
!flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view 
attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = 
[0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : 
tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = 
hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, 
offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, 
index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> 
!hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = 
arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: 
tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], 
strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = 
arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = 
linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, 
strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 
"input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : 
tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = 
flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] 
: tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, 
index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view 
util.return %7 : !hal.buffer_view } // -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 
: f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch.workgroups(%4, %5, %c2, %c6, %c11, %c13) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor>, %arg3: !flow.dispatch.tensor>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %8 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%arg4, %arg5, %arg6, %arg7], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg4, %arg5, %arg6, %arg7} -> tensor %9 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %10 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %8 : tensor to tensor<2x6x11x13xf32> %11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %12 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %9 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%11 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %12, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> flow.return } %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // util.func public 
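The dump that follows is the first in this stretch where the IR actually changes: MaterializeDefaultWorkgroupCountRegion attaches the workload [%c2, %c6, %c11, %c13] to flow.dispatch.workgroups, routes the captured sizes through flow.dispatch.workload.ordinal, and appends a count region. As a side check on the shapes these dumps carry, the static 2x4x7x9 result is just the usual convolution size arithmetic applied to the 2x6x11x13 input and the 4x6x5x5 filter; a minimal Python sketch (illustration only, not part of the log):

    def conv_out(size, kernel, stride=1, dilation=1, padding=0):
        # floor((size + 2*padding - dilation*(kernel-1) - 1) / stride) + 1
        return (size + 2 * padding - dilation * (kernel - 1) - 1) // stride + 1

    # NCHW input 2x6x11x13, FCHW filter 4x6x5x5, unit strides/dilations, no padding:
    assert conv_out(11, 5) == 7 and conv_out(13, 5) == 9  # -> tensor<2x4x7x9xf32>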
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %6 = flow.dispatch.workgroups[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index) -> tensor<2x4x7x9xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
    %8 = flow.dispatch.workload.ordinal %arg4, 0 : index
    %9 = flow.dispatch.workload.ordinal %arg5, 1 : index
    %10 = flow.dispatch.workload.ordinal %arg6, 2 : index
    %11 = flow.dispatch.workload.ordinal %arg7, 3 : index
    %cst = arith.constant 0.000000e+00 : f32
    %12 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, 0], sizes = [%8, %9, %10, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%8, %9, %10, %11} -> tensor<?x?x?x?xf32>
    %13 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
    %14 = tensor.empty() : tensor<2x4x7x9xf32>
    %cast = tensor.cast %12 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
    %15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    %16 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %13 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%15 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    flow.dispatch.tensor.store %16, %arg8, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    flow.return
  } count(%arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg2, %arg3, %arg4, %arg5
    flow.return %x, %y, %z : index, index, index
  }
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
// -----// IR Dump Before VerifyInputLegalityPass (iree-verify-input-legality) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    // (body identical to @main in the dump above)
  }
}
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- //
// (verification-only pass; module identical to the previous dump, elided)
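To keep the semantics in view while reading the remaining dumps: the dispatch body zero-fills a 2x4x7x9 accumulator (linalg.fill) and then runs linalg.conv_2d_nchw_fchw with unit strides and dilations. A small NumPy reference model of that computation, under the shapes shown above (a sketch for intuition; the function name is ours, not an IREE API):

    import numpy as np

    def conv_2d_nchw_fchw(x, w):
        # x: (N, C, H, W) input; w: (F, C, KH, KW) filter; unit strides/dilations.
        n, c, h, wd = x.shape
        f, _, kh, kw = w.shape
        out = np.zeros((n, f, h - kh + 1, wd - kw + 1), dtype=x.dtype)  # linalg.fill
        for oh in range(out.shape[2]):
            for ow in range(out.shape[3]):
                patch = x[:, :, oh:oh + kh, ow:ow + kw]          # (N, C, KH, KW)
                out[:, :, oh, ow] = np.einsum("nchw,fchw->nf", patch, w)
        return out

    x = np.random.rand(2, 6, 11, 13).astype(np.float32)
    w = np.random.rand(4, 6, 5, 5).astype(np.float32)
    assert conv_2d_nchw_fchw(x, w).shape == (2, 4, 7, 9)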
// -----// IR Dump Before CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- //
// (@main unchanged; identical to the function shown after
//  MaterializeDefaultWorkgroupCountRegionPass, elided)
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %6 = flow.dispatch.workgroups[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index, %arg12: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
    %8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg8, %arg9, %arg10, %arg11}
    %9 = flow.dispatch.workload.ordinal %arg4, 0 : index
    %10 = flow.dispatch.workload.ordinal %arg5, 1 : index
    %11 = flow.dispatch.workload.ordinal %arg6, 2 : index
    %12 = flow.dispatch.workload.ordinal %arg7, 3 : index
    %cst = arith.constant 0.000000e+00 : f32
    %13 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%9, %10, %11, %12], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%9, %10, %11, %12} -> tensor<?x?x?x?xf32>
    %14 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
    %15 = tensor.empty() : tensor<2x4x7x9xf32>
    %cast = tensor.cast %13 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
    %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    %17 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %14 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%16 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    flow.dispatch.tensor.store %17, %arg12, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    flow.return
  } count(%arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg2, %arg3, %arg4, %arg5
    flow.return %x, %y, %z : index, index, index
  }
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
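CaptureDynamicDims is the other structural change in this stretch: the four hal.buffer_view.dim values (%0 through %3) are now passed into the dispatch alongside the workload, and flow.dispatch.tie_shape binds them to the readonly tensor so the region carries its own shape information. The public contract of @main is stable from here on: two buffer views in, one out. Assuming this log comes from iree-compile targeting llvm-cpu (as #executable_target_embedded_elf_x86_64_ above suggests), the finished module could be exercised roughly as below with the iree-runtime Python bindings and checked against the NumPy model; the file name, driver, and exact API surface are assumptions and may differ by version:

    import numpy as np
    from iree import runtime as ireert

    # Load the compiled artifact (path and driver are illustrative assumptions).
    module = ireert.load_vm_flatbuffer_file("conv.vmfb", driver="local-task")

    x = np.random.rand(2, 6, 11, 13).astype(np.float32)  # %input0: tensor<?x?x?x?xf32>
    w = np.random.rand(4, 6, 5, 5).astype(np.float32)    # %input1: tensor<4x6x5x5xf32>
    y = module.main(x, w)                                # -> %output0: tensor<2x4x7x9xf32>
    assert np.asarray(y).shape == (2, 4, 7, 9)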
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %6 = flow.dispatch.workgroups[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index, %arg12: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
    %cst = arith.constant 0.000000e+00 : f32
    %8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg8, %arg9, %arg10, %arg11}
    %9 = flow.dispatch.workload.ordinal %arg4, 0 : index
    %10 = flow.dispatch.workload.ordinal %arg5, 1 : index
    %11 = flow.dispatch.workload.ordinal %arg6, 2 : index
    %12 = flow.dispatch.workload.ordinal %arg7, 3 : index
    %13 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%9, %10, %11, %12], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg8, %arg9, %arg10, %arg11} -> tensor<?x?x?x?xf32>
    %14 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
    %15 = tensor.empty() : tensor<2x4x7x9xf32>
    %cast = tensor.cast %13 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
    %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    %17 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %14 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%16 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    flow.dispatch.tensor.store %17, %arg12, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    flow.return
  } count(%arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg2, %arg3, %arg4, %arg5
    flow.return %x, %y, %z : index, index, index
  }
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
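// The canonicalizer hoisted %cst to the top of the region and folded the captured dims into the
// flow.dispatch.tensor.load, whose shape annotation now uses the region arguments
// {%arg8, %arg9, %arg10, %arg11} directly instead of the workload ordinals.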
// -----// IR Dump Before CSE (cse) //----- //
// (IR identical to the After CanonicalizerPass dump above)
// -----// IR Dump After CSE (cse) //----- //
// (IR unchanged)
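// CSE finds no duplicated computation to eliminate, so the IR passes through unchanged.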
// -----// IR Dump Before InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- //
// (IR identical to the dump above)
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- //
// (IR unchanged)
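// InitializeEmptyTensorsPass rewrites tensor.empty ops that would escape a dispatch as results
// (presumably into splat initializations); the tensor.empty here is already consumed by
// linalg.fill inside the dispatch, so nothing changes.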
// -----// IR Dump Before OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch.workgroups[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> = (%arg2: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index, %arg12: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
      %cst = arith.constant 0.000000e+00 : f32
      %8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg8, %arg9, %arg10, %arg11}
      %9 = flow.dispatch.workload.ordinal %arg4, 0 : index
      %10 = flow.dispatch.workload.ordinal %arg5, 1 : index
      %11 = flow.dispatch.workload.ordinal %arg6, 2 : index
      %12 = flow.dispatch.workload.ordinal %arg7, 3 : index
      %13 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%9, %10, %11, %12], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg8, %arg9, %arg10, %arg11} -> tensor<?x?x?x?xf32>
      %14 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %15 = tensor.empty() : tensor<2x4x7x9xf32>
      %cast = tensor.cast %13 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %17 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %14 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%16 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %17, %arg12, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      flow.return
    } count(%arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg2, %arg3, %arg4, %arg5
      flow.return %x, %y, %z : index, index, index
    }
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- //
// (module IR unchanged)
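// There are no extern dispatches to outline, so the pass is a no-op. The module-level dump also
// shows the compilation context: the llvm-cpu executable target
// (#executable_target_embedded_elf_x86_64_) and the local #device_target_local device that all
// later stages compile against.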
// -----// IR Dump Before OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- //
// (module IR identical to the dump above)
// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
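// OutlineDispatchRegionsPass moves the flow.dispatch.workgroups body into a standalone
// flow.executable (@main_dispatch_0) whose export carries the workgroup-count region; the call
// site in @main shrinks to a single flow.dispatch op.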
// -----// IR Dump Before AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- //
// (module IR identical to the After OutlineDispatchRegionsPass dump above)
// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
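// AnnotateDispatchesPass renames the export to summarize its contents (a conv_2d_nchw_fchw over
// f32, with the relevant static sizes encoded in the name), which keeps later dumps, traces, and
// profiles human-readable.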
// -----// IR Dump Before StripDebugOps (iree-util-strip-debug-ops) //----- //
// (flow.executable @main_dispatch_0 only, exactly as in the After AnnotateDispatchesPass dump above)
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- //
// (IR unchanged)
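// There are no debug ops in the executable to strip; before and after are identical.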
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
// (util.func @main only, exactly as in the After AnnotateDispatchesPass dump above)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (IR unchanged)
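// This canonicalizer run operates on the now-outlined @main and finds nothing left to fold.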
// -----// IR Dump Before DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- //
// (full module, identical to the After AnnotateDispatchesPass dump above)
// -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- //
// (module IR unchanged)
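// The module contains a single executable, so DeduplicateExecutablesPass has nothing to merge.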
// -----// IR Dump Before InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
// (util.func @main only, unchanged)
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
// (IR unchanged)
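// InjectTensorTracingPass only rewrites dispatches that were marked for tensor tracing; none are
// marked here, so the function is unchanged.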
// -----// IR Dump Before CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- //
// (util.func @main only, unchanged)
// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- //
// (IR unchanged)
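// CleanupTensorShapesPass verifies that no stray tensor shape queries remain outside dispatch
// regions now that dims are carried explicitly; @main is already clean.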
// -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- //
// (full module, identical to the After DeduplicateExecutablesPass state above)
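// From here the flow pipeline runs under a fixed-point iterator that repeats the cleanup passes
// until the module stops changing; the next dumps carry the iree.fixedpoint.iteration = 0 module
// attribute recording the first iteration.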
%arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z 
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
  %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
  %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
}
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
// -----// IR Dump Before VerifyInputPass (iree-stream-verify-input) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @main_dispatch_0 {
    flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %1 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %2 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %3 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %4 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %7 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %5 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3}
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32>
    %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor<?x?x?x?xf32>{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32>
    %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
}
flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> 
tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After Canonicalizer (canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch 
@main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } // -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = 
#hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = 
"x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = 
"generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// 
IR Dump After FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, 
index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch 
@main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = 
hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : 
!hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = 
arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump Before ConvertToStreamPass (iree-stream-conversion) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local flow.executable private @main_dispatch_0 { flow.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %1 = flow.dispatch.workload.ordinal %arg2, 0 : index %2 = flow.dispatch.workload.ordinal %arg3, 1 : index %3 = flow.dispatch.workload.ordinal %arg4, 2 : index %4 = flow.dispatch.workload.ordinal %arg5, 3 : index %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [%1, %2, %3, %4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %7 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %5 : tensor to tensor<2x6x11x13xf32> %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %6 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%8 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %9, %arg10, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = 
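// ConvertToStreamPass produces the first dump below that actually changes the
// IR: flow.executable becomes stream.executable, tensor-typed dispatch
// operands become !stream.binding arguments resolved via
// stream.binding.subspan, and the ABI boundary is rewritten from
// hal.tensor.import/export into stream.tensor.import/export plus
// stream.async.transfer over sized !stream.resource values, with
// hal.buffer_view.assert guarding each input's declared shape and element type.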
{iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %4 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor{%0, %1, %2, %3} %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<4x6x5x5xf32> %6 = flow.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%4, %5, %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (tensor{%c2, %c6, %c11, %c13}, tensor<4x6x5x5xf32>, index, index, index, index, index, index, index, index) -> tensor<2x4x7x9xf32> %7 = hal.tensor.export %6 "output0" : tensor<2x4x7x9xf32> -> !hal.buffer_view util.return %7 : !hal.buffer_view } } // -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %cst = arith.constant 0.000000e+00 : f32 %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, 
tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor{%0, %1, %2, %3} : index %5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%4} %6 = stream.async.transfer %5 : !stream.resource{%4} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%4} %element_type_f32_0 = hal.element_type : i32 %dense_row_major_1 = hal.encoding_type : i32 %c4 = arith.constant 4 : index %c6_2 = arith.constant 6 : index %c5 = arith.constant 5 : index %c5_3 = arith.constant 5 : index hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6_2, %c5, %c5_3]) type(%element_type_f32_0) encoding(%dense_row_major_1) %7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<4x6x5x5xf32> : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%7} %9 = stream.async.transfer %8 : !stream.resource{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7} %c0 = arith.constant 0 : index %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x4x7x9xf32> : index %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%6[%c0 to %4 for %4], %9[%c0 to %7 for %7], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%4}, !stream.resource<*>{%7}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%10} %12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%10} %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x4x7x9xf32> in !stream.resource{%10} -> !hal.buffer_view util.return %13 : !hal.buffer_view } } // -----// IR Dump Before VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = 
#hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %cst = arith.constant 0.000000e+00 : f32 %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor{%0, %1, %2, %3} : index %5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%4} %6 = stream.async.transfer %5 : !stream.resource{%4} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) 
!stream.resource<*>{%4} %element_type_f32_0 = hal.element_type : i32 %dense_row_major_1 = hal.encoding_type : i32 %c4 = arith.constant 4 : index %c6_2 = arith.constant 6 : index %c5 = arith.constant 5 : index %c5_3 = arith.constant 5 : index hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6_2, %c5, %c5_3]) type(%element_type_f32_0) encoding(%dense_row_major_1) %7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<4x6x5x5xf32> : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%7} %9 = stream.async.transfer %8 : !stream.resource{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7} %c0 = arith.constant 0 : index %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x4x7x9xf32> : index %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%6[%c0 to %4 for %4], %9[%c0 to %7 for %7], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%4}, !stream.resource<*>{%7}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%10} %12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%10} %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x4x7x9xf32> in !stream.resource{%10} -> !hal.buffer_view util.return %13 : !hal.buffer_view } } // -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %cst = arith.constant 0.000000e+00 : f32 %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, 
%6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor{%0, %1, %2, %3} : index %5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%4} %6 = stream.async.transfer %5 : !stream.resource{%4} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%4} %element_type_f32_0 = hal.element_type : i32 %dense_row_major_1 = hal.encoding_type : i32 %c4 = arith.constant 4 : index %c6_2 = arith.constant 6 : index %c5 = arith.constant 5 : index %c5_3 = arith.constant 5 : index hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6_2, %c5, %c5_3]) type(%element_type_f32_0) encoding(%dense_row_major_1) %7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<4x6x5x5xf32> : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%7} %9 = stream.async.transfer %8 : !stream.resource{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7} %c0 = arith.constant 0 : index %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x4x7x9xf32> : index %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%6[%c0 to %4 for %4], %9[%c0 to %7 for %7], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%4}, !stream.resource<*>{%7}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%10} %12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%10} %13 = stream.tensor.export 
// -----// IR Dump Before Inliner (inline) //----- //
// (dump elided: byte-for-byte identical to the module printed above)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
  %c0 = arith.constant 0 : index
  %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %cst = arith.constant 0.000000e+00 : f32
  %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
  %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
  %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
  %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
  %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
  %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
  %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
  %9 = tensor.empty() : tensor<2x4x7x9xf32>
  %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
  flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
  %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
  %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
  %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
  %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
  %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
  %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
  %9 = tensor.empty() : tensor<2x4x7x9xf32>
  %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
  flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  return
}
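// Canonicalization's only visible effect on the dispatch function is constant
// reordering (%cst is hoisted above %c0); the loads, fill, and convolution are
// untouched.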
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (dump elided: identical to util.func @main in the last full module dump above)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xf32>{%0, %1, %2, %3} : index
  %5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%4}
  %6 = stream.async.transfer %5 : !stream.resource<external>{%4} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%4}
  %element_type_f32_0 = hal.element_type<f32> : i32
  %dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32_0) encoding(%dense_row_major_1)
  %7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<4x6x5x5xf32> : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%7}
  %9 = stream.async.transfer %8 : !stream.resource<external>{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7}
  %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x4x7x9xf32> : index
  %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%6[%c0 to %4 for %4], %9[%c0 to %7 for %7], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%4}, !stream.resource<*>{%7}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%10}
  %12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
  %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x4x7x9xf32> in !stream.resource<external>{%10} -> !hal.buffer_view
  util.return %13 : !hal.buffer_view
}
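// Canonicalizing @main dedupes the filter-shape constants (%c6_2 and %c5_3 fold
// into %c6/%c5), hoists %c0 to the top of the entry block, and rewrites the
// input1 assert to shape([%c4, %c6, %c5, %c5]); the dataflow itself is
// unchanged. Note that the duplicate %element_type_f32_0/%dense_row_major_1
// queries survive: canonicalization does not CSE across identical ops.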
// -----// IR Dump After Inliner (inline) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // (dispatch function elided: identical to the canonicalized dispatch function above)
    }
  }
  // (util.func @main elided: identical to the canonicalized @main above)
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (dump elided: identical to the canonicalized util.func @main above)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (dump elided: canonicalization is idempotent here; @main is unchanged)
// -----// IR Dump Before CSE (cse) //----- //
// (dump elided: identical to the @main above)
// -----// IR Dump After CSE (cse) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xf32>{%0, %1, %2, %3} : index
  %5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%4}
  %6 = stream.async.transfer %5 : !stream.resource<external>{%4} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%4}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<4x6x5x5xf32> : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%7}
  %9 = stream.async.transfer %8 : !stream.resource<external>{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7}
  %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x4x7x9xf32> : index
  %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%6[%c0 to %4 for %4], %9[%c0 to %7 for %7], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%4}, !stream.resource<*>{%7}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%10}
  %12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
  %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x4x7x9xf32> in !stream.resource<external>{%10} -> !hal.buffer_view
  util.return %13 : !hal.buffer_view
}
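// CSE merges the duplicate element-type/encoding queries: %element_type_f32_0
// and %dense_row_major_1 are replaced by the %element_type_f32 and
// %dense_row_major values already computed for input0, so both buffer-view
// asserts now share one pair.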
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (dump elided: identical to the post-CSE @main above)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (dump elided: the pass makes no changes to @main)
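// SimplifyGlobalAccesses reports no changes here, presumably because @main
// performs no util.global loads or stores that could be hoisted or merged.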
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // (dispatch function elided: identical to the canonicalized dispatch function above)
    }
  }
  // (util.func @main elided: identical to the post-CSE @main above)
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
// (dump elided: identical to the module above)
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
// (dump elided: identical to the module above)
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (dump elided: identical to the module above)
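// FoldGlobals and FuseGlobals likewise leave the module untouched; the only
// global is the immutable @__device_0 device handle, so there appears to be
// nothing to fold or fuse.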
index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor{%0, %1, %2, %3} : index %5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%4} %6 = stream.async.transfer %5 : !stream.resource{%4} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%4} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<4x6x5x5xf32> : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%7} %9 = stream.async.transfer %8 : !stream.resource{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7} %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x4x7x9xf32> : index %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%6[%c0 to %4 for %4], %9[%c0 to %7 for %7], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%4}, !stream.resource<*>{%7}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%10} %12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%10} %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x4x7x9xf32> in !stream.resource{%10} -> !hal.buffer_view util.return %13 : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = 
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (identical to the previous dump)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (identical to the previous dump)
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
// (identical to the previous dump)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (identical to the previous dump)
// -----// IR Dump Before CombineInitializers (iree-util-combine-initializers) //----- //
// (identical to the previous dump)
// -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- //
// (identical to the previous dump)
// -----// IR Dump Before EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- //
stream.executable private @main_dispatch_0 {
  stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
    stream.return %x, %y, %z : index, index, index
  }
  builtin.module {
    func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
      %cst = arith.constant 0.000000e+00 : f32
      %c0 = arith.constant 0 : index
      %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
      %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
      %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
      %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
      %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
      %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
      %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
      %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
      %9 = tensor.empty() : tensor<2x4x7x9xf32>
      %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
      %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
      flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
      return
    }
  }
}
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- //
// (identical to the previous dump)
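For readers following what the dispatch region computes: linalg.conv_2d_nchw_fchw with dilations = [1, 1] and strides = [1, 1] is a plain sliding-window contraction over the channel and filter dimensions, accumulating into the zero-filled output. A NumPy reference of the same computation on these static shapes (an illustrative sketch only, not how IREE executes it):

import numpy as np

def conv2d_nchw_fchw(x, wgt):
    # Reference for linalg.conv_2d_nchw_fchw with unit stride/dilation.
    n, c, h, w = x.shape
    f, _, kh, kw = wgt.shape
    oh, ow = h - kh + 1, w - kw + 1
    out = np.zeros((n, f, oh, ow), dtype=x.dtype)  # the linalg.fill of 0.0
    for i in range(oh):
        for j in range(ow):
            window = x[:, :, i:i + kh, j:j + kw]   # n x c x kh x kw
            out[:, :, i, j] = np.einsum("nchw,fchw->nf", window, wgt)
    return out

x = np.ones((2, 6, 11, 13), dtype=np.float32)
wgt = np.ones((4, 6, 5, 5), dtype=np.float32)
assert conv2d_nchw_fchw(x, wgt).shape == (2, 4, 7, 9)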
// -----// IR Dump Before EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?x?x?xf32>{%0, %1, %2, %3} : index
  %5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%4}
  %6 = stream.async.transfer %5 : !stream.resource<external>{%4} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%4}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<4x6x5x5xf32> : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%7}
  %9 = stream.async.transfer %8 : !stream.resource<external>{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7}
  %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x4x7x9xf32> : index
  %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%6[%c0 to %4 for %4], %9[%c0 to %7 for %7], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%4}, !stream.resource<*>{%7}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%10}
  %12 = stream.async.transfer %11 : !stream.resource<*>{%10} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%10}
  %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2x4x7x9xf32> in !stream.resource<external>{%10} -> !hal.buffer_view
  util.return %13 : !hal.buffer_view
}
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
  %9 = stream.async.transfer %8 : !stream.resource<external>{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %10 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
  %11 = stream.async.transfer %10 : !stream.resource<external>{%c2400} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c2400}
  %12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%9[%c0 to %7 for %7], %11[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%7}, !stream.resource<*>{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%c2016}
  %13 = stream.async.transfer %12 : !stream.resource<*>{%c2016} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c2016}
  %14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
  util.return %14 : !hal.buffer_view
}
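EncodeHostTensorsPass is where stream.tensor.sizeof becomes concrete byte counts: the two static f32 tensors fold to the constants %c2400 and %c2016, and the dynamic tensor<?x?x?x?xf32> size becomes the chain of arith.muli ops (a 4-byte element size multiplied by each runtime dimension). The arithmetic, checked in Python (a standalone verification, not part of the log):

import math

def f32_row_major_bytes(*dims):
    return 4 * math.prod(dims)  # 4 bytes per f32 element, dense row-major

assert f32_row_major_bytes(4, 6, 5, 5) == 2400   # %c2400: the filter
assert f32_row_major_bytes(2, 4, 7, 9) == 2016   # %c2016: the conv result

# The dynamic input follows the same product, built as
#   %4 = %0 * 4; %5 = %4 * %1; %6 = %5 * %2; %7 = %6 * %3
d0, d1, d2, d3 = 2, 6, 11, 13  # e.g. the shapes this trace was compiled for
assert ((d0 * 4) * d1 * d2 * d3) == f32_row_major_bytes(d0, d1, d2, d3) == 6864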
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (identical to the previous dump)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (identical to the previous dump)
// -----// IR Dump Before CSE (cse) //----- //
// (identical to the previous dump)
// -----// IR Dump After CSE (cse) //----- //
// (identical to the previous dump)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (identical to the previous dump)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (identical to the previous dump)
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %9 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    %9 = stream.async.transfer %8 : !stream.resource<external>{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %10 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %11 = stream.async.transfer %10 : !stream.resource<external>{%c2400} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c2400}
    %12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%9[%c0 to %7 for %7], %11[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%7}, !stream.resource<*>{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%c2016}
    %13 = stream.async.transfer %12 : !stream.resource<*>{%c2016} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c2016}
    %14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %14 : !hal.buffer_view
  }
}
"sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} %9 = stream.async.transfer %8 : !stream.resource{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %10 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %11 = stream.async.transfer %10 : !stream.resource{%c2400} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c2400} %12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%9[%c0 to %7 for %7], %11[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%7}, !stream.resource<*>{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%c2016} %13 = stream.async.transfer %12 : !stream.resource<*>{%c2016} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%c2016} %14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %14 : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func 
@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} %9 = stream.async.transfer %8 : !stream.resource{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %10 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %11 = stream.async.transfer %10 : !stream.resource{%c2400} from(#hal.device.affinity<@__device_0>) -> 
to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c2400} %12 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%9[%c0 to %7 for %7], %11[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<*>{%7}, !stream.resource<*>{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource<*>{%c2016} %13 = stream.async.transfer %12 : !stream.resource<*>{%c2016} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%c2016} %14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %14 : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: 
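// NOTE: a quick sanity check on the constants in @main above (added annotation, not part of the original dump):
//   %c2400 = 4*6*5*5 f32 elements * 4 bytes = 2400 (input1, the FCHW filter)
//   %c2016 = 2*4*7*9 f32 elements * 4 bytes = 2016 (the conv result)
//   output spatial dims follow from stride 1, dilation 1, no padding: 7 = 11 - 5 + 1 and 9 = 13 - 5 + 1,
//   with batch N=2 carried through and F=4 output channels from the filter (input C=6 matches filter C=6).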
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (module IR unchanged; identical to the dump above)
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (module IR unchanged; identical to the dump above)
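// NOTE: the arith.muli chain in @main computes the byte size of the dynamically
// shaped input0 for the import (added annotation): %7 = (((%0 * 4) * %1) * %2) * %3,
// i.e. d0*d1*d2*d3 elements times 4 bytes per f32 element (%c4 = sizeof(f32)).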
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (module IR unchanged; identical to the dump above)
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
// (module IR unchanged; identical to the dump above)
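// NOTE: how the dispatch-site operands map into the dispatch function (added annotation):
// %arg0/%arg1/%arg10 are the !stream.binding resources (input data, filter, result);
// %arg2..%arg5 carry the workload [%c2, %c6, %c11, %c13] and are re-associated inside
// the body via flow.dispatch.workload.ordinal; %arg6..%arg9 carry the runtime dims
// %0..%3 of input0. The export's workgroups() region derives the X/Y/Z workgroup
// count from that workload via flow.dispatch.workgroup_count_from_slice.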
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (module IR unchanged; identical to the dump above)
// -----// IR Dump Before VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- //
// (module IR unchanged; identical to the dump above)
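// NOTE on resource lifetimes in the stream dialect (added annotation, inferred from
// the op and type names): !stream.resource<external> holds buffers that cross the
// ABI boundary via stream.tensor.import/export, while !stream.resource<*> is a
// not-yet-refined lifetime placeholder; the stream.async.transfer ops bridge the
// two and are candidates for removal once usage is refined (see the elide-copies
// pass later in this log).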
// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- //
// (module IR unchanged; identical to the dump above)
// -----// IR Dump Before MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- //
// (this dump prints util.func @main only; identical to @main in the full dump above)
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- //
// (util.func @main unchanged; identical to @main above)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (util.func @main unchanged; identical to @main above)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (util.func @main unchanged; identical to @main above)
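// NOTE: headers of the form "// -----// IR Dump Before/After <Pass> (<flag>) //----- //"
// come from MLIR's IR-printing instrumentation. A plausible way to reproduce a log
// like this one (assumed; the exact invocation is not recorded in the dump) is:
//   iree-compile --iree-hal-target-backends=llvm-cpu \
//     --mlir-print-ir-before-all --mlir-print-ir-after-all \
//     input.mlir -o output.vmfb
// where input.mlir/output.vmfb are hypothetical file names.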
// -----// IR Dump Before ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %9 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  // (util.func @main body identical to the "After MaterializeCopyOnWritePass" dump above.)
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {...}} { ... }
}
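// Note: the stream.executable above pairs two views of the dispatch: the `workgroups`
// region derives the XYZ workgroup count from the four workload values via
// flow.dispatch.workgroup_count_from_slice, and the inner func.func resolves its
// opaque !stream.binding arguments into typed !flow.dispatch.tensor views. The
// tensor.cast from tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32> re-introduces the
// statically known input shape inside the dispatch before conv_2d_nchw_fchw runs.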
"embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) 
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (IR identical to the @main dump above.)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (IR unchanged; the pass made no modifications.)
// -----// IR Dump Before EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- //
// (IR identical to the preceding dump.)
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- //
// (IR unchanged; the pass made no modifications.)
// -----// IR Dump Before RefineUsagePass (iree-stream-refine-usage) //----- //
// (module IR identical to the "Before ElideAsyncCopiesPass" dump above.)
// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  // (stream.executable @main_dispatch_0 identical to the "Before ElideAsyncCopiesPass" dump above.)
  stream.executable private @main_dispatch_0 { ... }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%8[%c0 to %7 for %7], %9[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<external>{%7}, !stream.resource<external>{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource<external>{%c2016}
    %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %11 : !hal.buffer_view
  }
}
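// Note: RefineUsagePass is the only pass in this stretch of the pipeline that changes
// the IR. The three same-affinity stream.async.transfer ops are removed and every
// wildcard !stream.resource<*> is refined to a concrete <external> lifetime, so the
// dispatch now consumes the imported buffers (%8, %9) and produces the exported
// result (%10) directly.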
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (IR identical to @main in the "After RefineUsagePass" dump above.)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (IR unchanged; the pass made no modifications.)
// -----// IR Dump Before CSE (cse) //----- //
// (IR identical to the preceding dump.)
// -----// IR Dump After CSE (cse) //----- //
// (IR unchanged; the pass made no modifications.)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (IR identical to the preceding dump.)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (IR unchanged; the pass made no modifications.)
on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%8[%c0 to %7 for %7], %9[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016} %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %11 : !hal.buffer_view } // -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration 
= "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%8[%c0 to %7 for %7], %9[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016} %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %11 : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> 
!flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%8[%c0 to %7 for %7], %9[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016} %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %11 : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : 
index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%8[%c0 to %7 for %7], %9[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016} %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %11 : !hal.buffer_view } } // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : 
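// note: the magic buffer sizes above are plain row-major byte counts:
//   %c2400: weights input1 = 4*6*5*5 f32 elements * 4 bytes = 2400 bytes
//   %c2016: output0        = 2*4*7*9 f32 elements * 4 bytes = 2016 bytes
//   %7: the dynamically shaped input0 is sized at runtime as %0 * 4 * %1 * %2 * %3,
//       i.e. dim0 * sizeof(f32) * dim1 * dim2 * dim3, from the hal.buffer_view.dim queries above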
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (module identical to the dump above)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (module unchanged)
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
// (module identical to the dump above)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (module unchanged)
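// note: FoldGlobals, FuseGlobals and IPO are util-dialect cleanup passes; this module has a
// single immutable global (@__device_0) and one public entry point, so there is nothing to
// fold, fuse or propagate, and every dump in this stretch is byte-for-byte identical.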
// -----// IR Dump Before VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- //
// (module identical to the dump above)
// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- //
// (module unchanged)
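// note: as its name suggests, VerifyAsyncAccessRangesPass only checks that each
// stream.async.dispatch stays inside its declared operand ranges (e.g. %8[%c0 to %7 for %7]);
// it emits diagnostics rather than rewriting IR, which is why the dumps around it match.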
// -----// IR Dump Before ScheduleExecutionPass (iree-stream-schedule-execution) //----- //
// (function-scope dump of @main only; identical to @main in the module dump above)
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource{%7}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400}
  %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}) -> !stream.resource{%c2016} {
    %12 = stream.async.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%arg2[%c0 to %7 for %7], %arg3[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016}
    stream.yield %12 : !stream.resource{%c2016}
  } => !stream.timepoint
  %10 = stream.timepoint.await %result_timepoint => %results : !stream.resource{%c2016}
  %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view
  util.return %11 : !hal.buffer_view
}
// -----// IR Dump Before ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- //
// (function-scope dump; identical to the After-ScheduleExecutionPass dump above)
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- //
// (function unchanged: with a single dispatch there is nothing to group into a concurrent region)
// -----// IR Dump Before PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- //
// (module-scope dump; stream.executable @main_dispatch_0 and @main are identical to the state above)
// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- //
// (attribute aliases, util.global @__device_0 and stream.executable @main_dispatch_0 are unchanged and elided)
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource{%7}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400}
  %10 = stream.timepoint.immediate => !stream.timepoint
  %11 = stream.timepoint.immediate => !stream.timepoint
  %12 = stream.timepoint.join max(%10, %11) => !stream.timepoint
  %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%12) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}) -> !stream.resource{%c2016} {
    %15 = stream.async.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%arg2[%c0 to %7 for %7], %arg3[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016}
    stream.yield %15 : !stream.resource{%c2016}
  } => !stream.timepoint
  %13 = stream.timepoint.await %result_timepoint => %results : !stream.resource{%c2016}
  %14 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %13 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view
  util.return %14 : !hal.buffer_view
}
@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %10 = stream.timepoint.immediate => !stream.timepoint %11 = stream.timepoint.immediate => !stream.timepoint %12 = stream.timepoint.join max(%10, %11) => !stream.timepoint %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) 
// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- //
// (module unchanged by this pass; identical to the dump above)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (@main printed at function scope, identical to @main in the dump above)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
  %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}) -> !stream.resource<external>{%c2016} {
    %12 = stream.async.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%arg2[%c0 to %7 for %7], %arg3[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<external>{%7}, !stream.resource<external>{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource<external>{%c2016}
    stream.yield %12 : !stream.resource<external>{%c2016}
  } => !stream.timepoint
  %10 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2016}
  %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
  util.return %11 : !hal.buffer_view
}
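The only change the canonicalizer makes here is to the synchronization: both imports produced stream.timepoint.immediate values (already-signaled timepoints), their join is itself immediate, and awaiting an immediate timepoint is a no-op, so the await clause on stream.async.execute folds away. A toy model of that folding in Python; this is our sketch of the algebra, not IREE's implementation:

IMMEDIATE = None  # stands in for an already-signaled timepoint

def join(*timepoints):
    # A join of timepoints is immediate iff every input is immediate.
    pending = tuple(t for t in timepoints if t is not IMMEDIATE)
    return IMMEDIATE if not pending else pending

def await_is_noop(timepoint):
    # An await on an immediate timepoint can be erased entirely.
    return timepoint is IMMEDIATE

assert await_is_noop(join(IMMEDIATE, IMMEDIATE))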
// -----// IR Dump Before CSE (cse) //----- //
// (@main unchanged; identical to the post-canonicalize dump above)
// -----// IR Dump After CSE (cse) //----- //
// (@main unchanged)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (@main unchanged)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (@main unchanged)
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %9 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}) -> !stream.resource<external>{%c2016} {
      %12 = stream.async.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%arg2[%c0 to %7 for %7], %arg3[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource<external>{%7}, !stream.resource<external>{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource<external>{%c2016}
      stream.yield %12 : !stream.resource<external>{%c2016}
    } => !stream.timepoint
    %10 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2016}
    %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %11 : !hal.buffer_view
  }
}
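Each resource operand of stream.async.dispatch carries an explicit byte range, e.g. %arg2[%c0 to %7 for %7]: an offset, an exclusive end, and a redundant but explicit length. Our reading of that syntax as a hypothetical Python helper (names are ours):

def resource_range(buf: bytes, offset: int, end: int, length: int) -> bytes:
    # "[offset to end for length]": end - offset must equal length.
    assert end - offset == length
    return buf[offset:offset + length]

filter_buf = bytes(2400)  # input1: 4*6*5*5 f32 elements = 2400 bytes
assert len(resource_range(filter_buf, 0, 2400, 2400)) == 2400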
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
// (module unchanged)
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
// (module unchanged)
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (module unchanged)
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (module unchanged)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (module unchanged)
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
// (module unchanged)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (module unchanged)
// -----// IR Dump Before VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- //
// (module unchanged)
stream.async.execute on(#hal.device.affinity<@__device_0>) with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}) -> !stream.resource{%c2016} { %12 = stream.async.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%arg2[%c0 to %7 for %7], %arg3[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016} stream.yield %12 : !stream.resource{%c2016} } => !stream.timepoint %10 = stream.timepoint.await %result_timepoint => %results : !stream.resource{%c2016} %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %11 : !hal.buffer_view } } // -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> 
-> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}) -> !stream.resource{%c2016} { %12 = stream.async.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%arg2[%c0 to %7 for %7], %arg3[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016} stream.yield %12 : !stream.resource{%c2016} } => !stream.timepoint %10 = stream.timepoint.await %result_timepoint => %results : !stream.resource{%c2016} %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %11 : !hal.buffer_view } } // -----// IR Dump Before ScheduleAllocationPass (iree-stream-schedule-allocation) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func 
@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}) -> !stream.resource{%c2016} { %12 = stream.async.dispatch 
@main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%arg2[%c0 to %7 for %7], %arg3[%c0 to %c2400 for %c2400], %c2, %c6, %c11, %c13, %0, %1, %2, %3) : (!stream.resource{%7}, !stream.resource{%c2400}, index, index, index, index, index, index, index, index) -> !stream.resource{%c2016} stream.yield %12 : !stream.resource{%c2016} } => !stream.timepoint %10 = stream.timepoint.await %result_timepoint => %results : !stream.resource{%c2016} %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %11 : !hal.buffer_view } } // -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = 
"sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } } // -----// IR Dump Before PackConstantsPass (iree-stream-pack-constants) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) 
type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) 
await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump Before LayoutSlicesPass (iree-stream-layout-slices) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump After LayoutSlicesPass 
(iree-stream-layout-slices) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump Before PropagateSubranges (iree-util-propagate-subranges) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = 
flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized 
on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } } // -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 
0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %c0_0 = arith.constant 0 : index %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0_0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump After Canonicalizer (canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = 
stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump Before CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> 
!hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump After CSE (cse) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = 
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %9 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) {
        ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016}
    %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %12 : !hal.buffer_view
  }
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
// (IR unchanged by this pass; duplicate module dump elided)
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
// (verbatim duplicate of the module dump above; repeated output elided)
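The dispatch function in the module dump above reduces to a single linalg.conv_2d_nchw_fchw with unit strides and dilations, so the static result shape follows from 11 - 5 + 1 = 7 and 13 - 5 + 1 = 9 on the spatial dims. A minimal NumPy reference for the same op semantics (an illustrative sketch, not IREE code; the function name is invented here):

import numpy as np

def conv_2d_nchw_fchw(x, w):
    # Direct convolution matching linalg.conv_2d_nchw_fchw with unit
    # strides/dilations: x is NxCxHxW input, w is FxCxKHxKW filters.
    n, c, h, w_in = x.shape
    f, c2, kh, kw = w.shape
    assert c == c2
    out = np.zeros((n, f, h - kh + 1, w_in - kw + 1), dtype=x.dtype)
    for oh in range(out.shape[2]):
        for ow in range(out.shape[3]):
            patch = x[:, :, oh:oh + kh, ow:ow + kw]      # N x C x KH x KW
            out[:, :, oh, ow] = np.einsum("nchw,fchw->nf", patch, w)
    return out

x = np.random.rand(2, 6, 11, 13).astype(np.float32)
w = np.random.rand(4, 6, 5, 5).astype(np.float32)
assert conv_2d_nchw_fchw(x, w).shape == (2, 4, 7, 9)   # matches the dump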
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (IR unchanged by this pass; duplicate module dump elided)
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (verbatim duplicate of the module dump above; repeated output elided)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (IR unchanged by this pass; duplicate module dump elided)
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
// (verbatim duplicate of the module dump above; repeated output elided)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (IR unchanged by this pass; duplicate module dump elided)
// -----// IR Dump Before VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- //
// (verbatim duplicate of the module dump above; repeated output elided)
// -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- //
// (IR unchanged by this pass; duplicate module dump elided)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (function-scope dump of @main; verbatim duplicate of the @main dump above, repeated output elided)
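Throughout these dumps the @main function keeps the same async structure: stream.resource.alloca yields the result storage plus a timepoint, stream.cmd.execute awaits that timepoint and yields a new one, and stream.timepoint.await blocks before the tensor is exported. A rough host-side analogy of that chaining (illustrative Python only; this is not IREE's runtime API, and the helper names are invented):

import threading

class Timepoint:
    # Stand-in for !stream.timepoint: a one-shot completion signal.
    def __init__(self):
        self._done = threading.Event()
    def signal(self):
        self._done.set()
    def wait(self):
        self._done.wait()

def alloca(nbytes):
    # Like stream.resource.alloca: storage plus a timepoint that fires
    # once the (possibly asynchronous) allocation is ready.
    storage, tp = bytearray(nbytes), Timepoint()
    tp.signal()  # immediate here; a real allocator may signal later
    return storage, tp

def execute(await_tp, work):
    # Like stream.cmd.execute await(...): run `work` only after
    # `await_tp` fires; return a timepoint for the work's completion.
    done = Timepoint()
    def runner():
        await_tp.wait()
        work()
        done.signal()
    threading.Thread(target=runner).start()
    return done

result, result_tp = alloca(2016)              # %result, %result_timepoint
exec_tp = execute(result_tp, lambda: None)    # %10 = stream.cmd.execute ...
exec_tp.wait()                                # %11 = stream.timepoint.await %10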
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (function unchanged: identical to the @main dump above)
// -----// IR Dump Before CSE (cse) //----- //
// (function unchanged)
// -----// IR Dump After CSE (cse) //----- //
// (function unchanged)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (function unchanged)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (function unchanged)
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9}
        %3 = flow.dispatch.workload.ordinal %arg2, 0 : index
        %4 = flow.dispatch.workload.ordinal %arg3, 1 : index
        %5 = flow.dispatch.workload.ordinal %arg4, 2 : index
        %6 = flow.dispatch.workload.ordinal %arg5, 3 : index
        %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %9 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    // (body identical to the @main dump above)
  }
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
// (module unchanged: identical to the full-module dump above)
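For reference, this is how the stream.cmd.dispatch site in @main lines up with the signature of the dispatch function in the executable above. The annotation is ours, not part of the dump; the argument names follow the IR:

dispatch_abi = {
    "%arg0":       "ro binding: dynamic input (subspan -> tensor<?x?x?x?xf32>)",
    "%arg1":       "ro binding: filter (subspan -> tensor<4x6x5x5xf32>, 2400 bytes)",
    "%arg10":      "wo binding: result (subspan -> tensor<2x4x7x9xf32>, 2016 bytes)",
    "%arg2-%arg5": "workload values 2, 6, 11, 13, re-tagged by flow.dispatch.workload.ordinal 0-3",
    "%arg6-%arg9": "runtime dims %0-%3 queried from input0's buffer view",
}

Note that the four extents travel twice: once as the workload [%c2, %c6, %c11, %c13], which feeds only the workgroup-count region of the stream.executable.export, and once as plain index operands that the dispatch body reads.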
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
// (module unchanged)
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (module unchanged)
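One detail of @main worth calling out: the two hal.buffer_view.assert ops pin different amounts of static information. An illustrative Python rendering (function names are ours):

def check_input0(view_shape, element_type, encoding):
    # shape([%0, %1, %2, %3]) is read back from the view itself, so only
    # rank 4, element type f32 and dense row-major encoding are enforced;
    # the extents stay dynamic until the tensor.cast inside the dispatch.
    assert len(view_shape) == 4
    assert element_type == "f32" and encoding == "dense_row_major"

def check_input1(view_shape, element_type, encoding):
    # input1 is fully static: it must be exactly 4x6x5x5xf32.
    assert tuple(view_shape) == (4, 6, 5, 5)
    assert element_type == "f32" and encoding == "dense_row_major"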
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (module unchanged)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (module unchanged)
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
// (module unchanged)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (module unchanged)
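The stream ops in @main form a short timepoint chain: stream.resource.alloca yields the result buffer plus a timepoint, stream.cmd.execute awaits that timepoint and yields another, and stream.timepoint.await blocks on it before stream.tensor.export. A rough host-side analogy in Python (our sketch, not the IREE runtime API):

from concurrent.futures import ThreadPoolExecutor

def allocate(n):          # stands in for stream.resource.alloca
    return bytearray(n)

def run_dispatch(buf):    # stands in for the conv dispatch writing %result
    buf[0:4] = b"\x00\x00\x00\x00"
    return buf

pool = ThreadPoolExecutor(max_workers=1)
t0 = pool.submit(allocate, 2016)                     # alloca ... => !stream.timepoint
t1 = pool.submit(lambda: run_dispatch(t0.result()))  # execute await(t0) ... => t1
result = t1.result()                                 # stream.timepoint.await t1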
// -----// IR Dump Before SCFToControlFlow (convert-scf-to-cf) //----- //
// (function unchanged: @main contains no scf ops to convert)
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
// (function unchanged)
// -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- //
// (module unchanged: identical to the full-module dump above)
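iree-util-fixed-point-iterator wraps a cleanup sub-pipeline and, as the repeated Canonicalizer/CSE dumps around it suggest, appears to re-run that pipeline until the module stops changing. A generic sketch of such a driver loop (ours, not IREE's implementation):

def run_to_fixed_point(module, passes, max_iterations=10):
    # 'module' is an immutable IR snapshot; each pass returns a new snapshot.
    for _ in range(max_iterations):
        before = module
        for apply_pass in passes:
            module = apply_pass(module)
        if module == before:  # no pass made progress: fixed point reached
            break
    return module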
// -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32> %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } }
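// NOTE (added annotation, not compiler output): the dispatch's static output shape follows from the conv parameters. With strides and dilations of 1, out = in - kernel + 1, so 11 - 5 + 1 = 7 and 13 - 5 + 1 = 9, giving tensor<2x4x7x9xf32> (batch 2, 4 filters); the tensor.cast pins the dynamic input to the 2x6x11x13 shape this dispatch was specialized for.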
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view }
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view }
// -----// IR Dump Before CSE (cse) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view }
// -----// IR Dump After CSE (cse) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view }
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view }
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view }
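// NOTE (added annotation, not compiler output): host/device ordering in @main is carried by timepoints rather than blocking waits: stream.resource.alloca yields %result_timepoint, stream.cmd.execute awaits it and yields %10, and stream.timepoint.await %10 blocks only at the end, just before %result is exported as a !hal.buffer_view.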
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32> %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } }
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32> %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } }
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32> %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } }
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg6, %arg7, %arg8, %arg9} -> tensor<?x?x?x?xf32> %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource<external>{%7}, ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } }
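// NOTE (added annotation, not compiler output): the iree.fixedpoint.iteration = 0 module attribute appears to be bookkeeping for the FixedPointIterator driver, which reruns this cleanup sub-pipeline (canonicalize, cse, global folding/fusion, ipo) until the IR stops changing; the before/after dumps here are identical, so this iteration has already converged.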
public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, 
%arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute 
on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } } // -----// IR Dump Before IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> 
return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %11 = stream.timepoint.await %10 => %result : !stream.resource{%c2016} %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %12 : !hal.buffer_view } } // -----// IR Dump After IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, 
%arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !stream.binding) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %1 = stream.binding.subspan %arg10[%c0] : !stream.binding -> !flow.dispatch.tensor> %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} %3 = flow.dispatch.workload.ordinal %arg2, 0 : index %4 = flow.dispatch.workload.ordinal %arg3, 1 : index %5 = flow.dispatch.workload.ordinal %arg4, 2 : index %6 = flow.dispatch.workload.ordinal %arg5, 3 : index %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%arg6, %arg7, %arg8, %arg9} -> tensor %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %9 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %7 : tensor to tensor<2x6x11x13xf32> %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = stream.cmd.execute 
// -----// IR Dump Before ElideTimepointsPass (iree-stream-elide-timepoints) //----- //
// (Verbatim identical to the "After IPO" dump above.)
// -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- //
// (Unchanged by the pass; still identical to the dump above.)
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
// (Identical to the dump above except that the module attribute
//  `iree.fixedpoint.iteration = 0 : index` has been dropped, leaving
//  `module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>}`,
//  now that the fixed-point iteration has settled.)
// -----// IR Dump Before FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- //
// (Identical to the "After FixedPointIterator" state above.)
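// NOTE: FuseDispatchBindingsPass (next dump) reworks the dispatch interface:
// the three !stream.binding arguments are packed at the front of the argument
// list and each binding gains an explicit byte-offset operand, so subspans are
// taken at a caller-provided offset instead of a hard-coded zero. A hedged
// sketch of the rewrite, with illustrative names (not from the log):
//
//   // before: %0 = stream.binding.subspan %binding[%c0] : !stream.binding -> ...
//   // after:  %0 = stream.binding.subspan %binding[%offset] : !stream.binding -> ...
//   // where %offset is passed at the stream.cmd.dispatch site (here always 0).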
// -----// IR Dump After FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index, %arg12: index, %arg13: index) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %1 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %2 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg10, %arg11, %arg12, %arg13}
        %3 = flow.dispatch.workload.ordinal %arg6, 0 : index
        %4 = flow.dispatch.workload.ordinal %arg7, 1 : index
        %5 = flow.dispatch.workload.ordinal %arg8, 2 : index
        %6 = flow.dispatch.workload.ordinal %arg9, 3 : index
        %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg10, %arg11, %arg12, %arg13} -> tensor<?x?x?x?xf32>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %9 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %c0_0 = arith.constant 0 : index
    %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c0, %c0, %c0, %c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index, index, index, index) {
        ro %arg2[%c0_0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0_0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0_0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016}
    %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %12 : !hal.buffer_view
  }
}
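// NOTE: In the fused form above, %arg3-%arg5 are the new binding offsets (all
// zero at the only dispatch site, hence the extra %c0, %c0, %c0 operands) and
// %arg10-%arg13 still carry the unverified dynamic dims of input0.
// AnnotateDispatchArgumentsPass (next dump) then records what dataflow
// analysis can prove about each operand as argument attributes, e.g.:
//
//   %argN: index {stream.alignment = 2 : index, stream.values = [2 : index]}
//
// Operands fed straight from hal.buffer_view.dim stay unannotated: their
// values are unknown until runtime.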
// -----// IR Dump Before AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- //
// (Verbatim identical to the "After FuseDispatchBindingsPass" dump above.)
// -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: index {stream.values = [0 : index]}, %arg4: index {stream.values = [0 : index]}, %arg5: index {stream.values = [0 : index]}, %arg6: index {stream.alignment = 2 : index, stream.values = [2 : index]}, %arg7: index {stream.alignment = 2 : index, stream.values = [6 : index]}, %arg8: index {stream.values = [11 : index]}, %arg9: index {stream.values = [13 : index]}, %arg10: index, %arg11: index, %arg12: index, %arg13: index) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %1 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %2 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg10, %arg11, %arg12, %arg13}
        %3 = flow.dispatch.workload.ordinal %arg6, 0 : index
        %4 = flow.dispatch.workload.ordinal %arg7, 1 : index
        %5 = flow.dispatch.workload.ordinal %arg8, 2 : index
        %6 = flow.dispatch.workload.ordinal %arg9, 3 : index
        %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [%3, %4, %5, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%arg10, %arg11, %arg12, %arg13} -> tensor<?x?x?x?xf32>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %9 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %7 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %8 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%10 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %11, %1, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %c0_0 = arith.constant 0 : index
    %10 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c0, %c0, %c0, %c2, %c6, %c11, %c13, %0, %1, %2, %3 : index, index, index, index, index, index, index, index, index, index, index) {
        ro %arg2[%c0_0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0_0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0_0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %11 = stream.timepoint.await %10 => %result : !stream.resource<external>{%c2016}
    %12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %12 : !hal.buffer_view
  }
}
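// NOTE: PackDispatchOperandsPass (next dump) lowers every index operand to
// the 32-bit push-constant ABI: the host splits each value into lo/hi i32
// halves and the dispatch reassembles them. A minimal sketch of the round
// trip, using illustrative names:
//
//   // host side:
//   %v64 = arith.index_castui %v : index to i64
//   %lo = arith.trunci %v64 : i64 to i32
//   %hi64 = arith.shrui %v64, %c32_i64 : i64
//   %hi = arith.trunci %hi64 : i64 to i32
//   // device side:
//   %lo64 = arith.extui %lo : i32 to i64
//   %hi64b = arith.extui %hi : i32 to i64
//   %shl = arith.shli %hi64b, %c32_i64 : i64
//   %v64b = arith.ori %lo64, %shl : i64
//   %v2 = arith.index_castui %v64b : i64 to index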
// -----// IR Dump Before PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- //
// (Verbatim identical to the "After AnnotateDispatchArgumentsPass" dump above.)
// -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32, %arg12: i32, %arg13: i32, %arg14: i32, %arg15: i32, %arg16: i32, %arg17: i32, %arg18: i32, %arg19: i32, %arg20: i32, %arg21: i32, %arg22: i32, %arg23: i32, %arg24: i32) {
        %0 = arith.extui %arg3 : i32 to i64
        %1 = arith.extui %arg4 : i32 to i64
        %c32_i64 = arith.constant 32 : i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
        %5 = arith.extui %arg5 : i32 to i64
        %6 = arith.extui %arg6 : i32 to i64
        %c32_i64_0 = arith.constant 32 : i64
        %7 = arith.shli %6, %c32_i64_0 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
        %10 = arith.extui %arg7 : i32 to i64
        %11 = arith.extui %arg8 : i32 to i64
        %c32_i64_1 = arith.constant 32 : i64
        %12 = arith.shli %11, %c32_i64_1 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
        %15 = arith.extui %arg9 : i32 to i64
        %16 = arith.extui %arg10 : i32 to i64
        %c32_i64_2 = arith.constant 32 : i64
        %17 = arith.shli %16, %c32_i64_2 : i64
        %18 = arith.ori %15, %17 : i64
        %19 = arith.index_castui %18 {stream.alignment = 2 : index, stream.values = [2 : index]} : i64 to index
        %20 = arith.extui %arg11 : i32 to i64
        %21 = arith.extui %arg12 : i32 to i64
        %c32_i64_3 = arith.constant 32 : i64
        %22 = arith.shli %21, %c32_i64_3 : i64
        %23 = arith.ori %20, %22 : i64
        %24 = arith.index_castui %23 {stream.alignment = 2 : index, stream.values = [6 : index]} : i64 to index
        %25 = arith.extui %arg13 : i32 to i64
        %26 = arith.extui %arg14 : i32 to i64
        %c32_i64_4 = arith.constant 32 : i64
        %27 = arith.shli %26, %c32_i64_4 : i64
        %28 = arith.ori %25, %27 : i64
        %29 = arith.index_castui %28 {stream.values = [11 : index]} : i64 to index
        %30 = arith.extui %arg15 : i32 to i64
        %31 = arith.extui %arg16 : i32 to i64
        %c32_i64_5 = arith.constant 32 : i64
        %32 = arith.shli %31, %c32_i64_5 : i64
        %33 = arith.ori %30, %32 : i64
        %34 = arith.index_castui %33 {stream.values = [13 : index]} : i64 to index
        %35 = arith.extui %arg17 : i32 to i64
        %36 = arith.extui %arg18 : i32 to i64
        %c32_i64_6 = arith.constant 32 : i64
        %37 = arith.shli %36, %c32_i64_6 : i64
        %38 = arith.ori %35, %37 : i64
        %39 = arith.index_castui %38 : i64 to index
        %40 = arith.extui %arg19 : i32 to i64
        %41 = arith.extui %arg20 : i32 to i64
        %c32_i64_7 = arith.constant 32 : i64
        %42 = arith.shli %41, %c32_i64_7 : i64
        %43 = arith.ori %40, %42 : i64
        %44 = arith.index_castui %43 : i64 to index
        %45 = arith.extui %arg21 : i32 to i64
        %46 = arith.extui %arg22 : i32 to i64
        %c32_i64_8 = arith.constant 32 : i64
        %47 = arith.shli %46, %c32_i64_8 : i64
        %48 = arith.ori %45, %47 : i64
        %49 = arith.index_castui %48 : i64 to index
        %50 = arith.extui %arg23 : i32 to i64
        %51 = arith.extui %arg24 : i32 to i64
        %c32_i64_9 = arith.constant 32 : i64
        %52 = arith.shli %51, %c32_i64_9 : i64
        %53 = arith.ori %50, %52 : i64
        %54 = arith.index_castui %53 : i64 to index
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %55 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %56 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %57 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%39, %44, %49, %54}
        %58 = flow.dispatch.workload.ordinal %19, 0 : index
        %59 = flow.dispatch.workload.ordinal %24, 1 : index
        %60 = flow.dispatch.workload.ordinal %29, 2 : index
        %61 = flow.dispatch.workload.ordinal %34, 3 : index
        %62 = flow.dispatch.tensor.load %57, offsets = [0, 0, 0, 0], sizes = [%58, %59, %60, %61], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%39, %44, %49, %54} -> tensor<?x?x?x?xf32>
        %63 = flow.dispatch.tensor.load %55, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %64 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %62 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %65 = linalg.fill ins(%cst : f32) outs(%64 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %66 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %63 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%65 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %66, %56, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %c0_0 = arith.constant 0 : index
    %c0_i64 = arith.constant 0 : i64
    %c0_i32 = arith.constant 0 : i32
    %c32_i64 = arith.constant 32 : i64
    %c0_i64_1 = arith.constant 0 : i64
    %c0_i32_2 = arith.constant 0 : i32
    %c0_i64_3 = arith.constant 0 : i64
    %c0_i32_4 = arith.constant 0 : i32
    %c32_i64_5 = arith.constant 32 : i64
    %c0_i64_6 = arith.constant 0 : i64
    %c0_i32_7 = arith.constant 0 : i32
    %c0_i64_8 = arith.constant 0 : i64
    %c0_i32_9 = arith.constant 0 : i32
    %c32_i64_10 = arith.constant 32 : i64
    %c0_i64_11 = arith.constant 0 : i64
    %c0_i32_12 = arith.constant 0 : i32
    %c2_i64 = arith.constant 2 : i64
    %c2_i32 = arith.constant 2 : i32
    %c32_i64_13 = arith.constant 32 : i64
    %c0_i64_14 = arith.constant 0 : i64
    %c0_i32_15 = arith.constant 0 : i32
    %c6_i64 = arith.constant 6 : i64
    %c6_i32 = arith.constant 6 : i32
    %c32_i64_16 = arith.constant 32 : i64
    %c0_i64_17 = arith.constant 0 : i64
    %c0_i32_18 = arith.constant 0 : i32
    %c11_i64 = arith.constant 11 : i64
    %c11_i32 = arith.constant 11 : i32
    %c32_i64_19 = arith.constant 32 : i64
    %c0_i64_20 = arith.constant 0 : i64
    %c0_i32_21 = arith.constant 0 : i32
    %c13_i64 = arith.constant 13 : i64
    %c13_i32 = arith.constant 13 : i32
    %c32_i64_22 = arith.constant 32 : i64
    %c0_i64_23 = arith.constant 0 : i64
    %c0_i32_24 = arith.constant 0 : i32
    %10 = arith.index_castui %0 : index to i64
    %11 = arith.trunci %10 : i64 to i32
    %c32_i64_25 = arith.constant 32 : i64
    %12 = arith.shrui %10, %c32_i64_25 : i64
    %13 = arith.trunci %12 : i64 to i32
    %14 = arith.index_castui %1 : index to i64
    %15 = arith.trunci %14 : i64 to i32
    %c32_i64_26 = arith.constant 32 : i64
    %16 = arith.shrui %14, %c32_i64_26 : i64
    %17 = arith.trunci %16 : i64 to i32
    %18 = arith.index_castui %2 : index to i64
    %19 = arith.trunci %18 : i64 to i32
    %c32_i64_27 = arith.constant 32 : i64
    %20 = arith.shrui %18, %c32_i64_27 : i64
    %21 = arith.trunci %20 : i64 to i32
    %22 = arith.index_castui %3 : index to i64
    %23 = arith.trunci %22 : i64 to i32
    %c32_i64_28 = arith.constant 32 : i64
    %24 = arith.shrui %22, %c32_i64_28 : i64
    %25 = arith.trunci %24 : i64 to i32
    %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c0_i32, %c0_i32_2, %c0_i32_4, %c0_i32_7, %c0_i32_9, %c0_i32_12, %c2_i32, %c0_i32_15, %c6_i32, %c0_i32_18, %c11_i32, %c0_i32_21, %c13_i32, %c0_i32_24, %11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) {
        ro %arg2[%c0_0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0_0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0_0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
    %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %28 : !hal.buffer_view
  }
}
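// NOTE: Packing leaves a trail of redundant constants in @main above: a dozen
// duplicate `arith.constant 32 : i64` / `0 : i32` values plus dead i64
// constants that were only needed while folding the known operand halves.
// The canonicalizer dumps that follow collapse them to one instance each, so
// every known-zero high half becomes a single %c0_i32 at the dispatch site;
// only the four dynamic dims still need real trunci/shrui work at runtime.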
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (This dump is scoped to @main only; it is verbatim the util.func from the
//  "After PackDispatchOperandsPass" dump above.)
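// NOTE: In the dump that follows, the high i32 halves of the dynamic dims
// (%13, %17, %21, %25) are still computed and passed - nothing proves the
// hal.buffer_view dims fit in 32 bits, so only the constant halves folded.
// The CSE run that closes this log would at most merge any remaining
// structurally identical ops; the dispatch itself is already in its final
// packed form.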
Canonicalizer (canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13_i32 = arith.constant 13 : i32 %c11_i32 = arith.constant 11 : i32 %c6_i32 = arith.constant 6 : i32 %c2_i32 = arith.constant 2 : i32 %c32_i64 = arith.constant 32 : i64 %c0_i32 = arith.constant 0 : i32 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c2_i32, %c0_i32, %c6_i32, %c0_i32, %c11_i32, %c0_i32, %c13_i32, %c0_i32, %11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in 
!stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view }
// -----// IR Dump Before CSE (cse) //----- // (unchanged; verbatim duplicate of the After-Canonicalizer dump above, elided)
// -----// IR Dump After CSE (cse) //----- // (unchanged; canonicalization already deduplicated every constant, leaving CSE nothing to do)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // (unchanged; verbatim duplicate elided)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // (unchanged; @main has no util.global loads or stores to simplify)
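// [Editor's note: the Canonicalizer and CSE dumps above differ from the pre-pass IR in one visible way: the per-use constants (%c32_i64_19, %c32_i64_22, %c32_i64_25, and the many %c0_i32_N / %c0_i64_N copies) collapse into a single %c32_i64 and %c0_i32, and the now-dead i64 constants disappear. Below is a minimal standalone sketch of that folding, assuming only the upstream arith/func dialects; the function name is hypothetical and it can be fed to `mlir-opt --canonicalize --cse`:]
func.func @dedup_sketch(%arg0: i64, %arg1: i64) -> (i32, i32) {
  // Two textually distinct copies of the same constant, as in the pre-pass IR.
  %c32_a = arith.constant 32 : i64
  %c32_b = arith.constant 32 : i64
  %0 = arith.shrui %arg0, %c32_a : i64
  %1 = arith.shrui %arg1, %c32_b : i64
  %2 = arith.trunci %0 : i64 to i32
  %3 = arith.trunci %1 : i64 to i32
  // After --canonicalize --cse a single %c32_i64 feeds both arith.shrui ops,
  // matching the lone %c32_i64 in the post-CSE dump above.
  return %2, %3 : i32, i32
}
// [End of editor's sketch.]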
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32, %arg12: i32, %arg13: i32, %arg14: i32, %arg15: i32, %arg16: i32, %arg17: i32, %arg18: i32, %arg19: i32, %arg20: i32, %arg21: i32, %arg22: i32, %arg23: i32, %arg24: i32) { %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %c32_i64 = arith.constant 32 : i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %c32_i64_0 = arith.constant 32 : i64 %7 = arith.shli %6, %c32_i64_0 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %c32_i64_1 = arith.constant 32 : i64 %12 = arith.shli %11, %c32_i64_1 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %c32_i64_2 = arith.constant 32 : i64 %17 = arith.shli %16, %c32_i64_2 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 {stream.alignment = 2 : index, stream.values = [2 : index]} : i64 to index %20 = arith.extui %arg11 : i32 to i64 %21 = arith.extui %arg12 : i32 to i64 %c32_i64_3 = arith.constant 32 : i64 %22 = arith.shli %21, %c32_i64_3 : i64 %23 = arith.ori %20, %22 : i64 %24 = arith.index_castui %23 {stream.alignment = 2 : index, stream.values = [6 : index]} : i64 to index %25 = arith.extui %arg13 : i32 to i64 %26 = arith.extui %arg14 : i32 to i64 %c32_i64_4 = arith.constant 32 : i64 %27 = arith.shli %26, %c32_i64_4 : i64 %28 = arith.ori %25, %27 : i64 %29 
= arith.index_castui %28 {stream.values = [11 : index]} : i64 to index %30 = arith.extui %arg15 : i32 to i64 %31 = arith.extui %arg16 : i32 to i64 %c32_i64_5 = arith.constant 32 : i64 %32 = arith.shli %31, %c32_i64_5 : i64 %33 = arith.ori %30, %32 : i64 %34 = arith.index_castui %33 {stream.values = [13 : index]} : i64 to index %35 = arith.extui %arg17 : i32 to i64 %36 = arith.extui %arg18 : i32 to i64 %c32_i64_6 = arith.constant 32 : i64 %37 = arith.shli %36, %c32_i64_6 : i64 %38 = arith.ori %35, %37 : i64 %39 = arith.index_castui %38 : i64 to index %40 = arith.extui %arg19 : i32 to i64 %41 = arith.extui %arg20 : i32 to i64 %c32_i64_7 = arith.constant 32 : i64 %42 = arith.shli %41, %c32_i64_7 : i64 %43 = arith.ori %40, %42 : i64 %44 = arith.index_castui %43 : i64 to index %45 = arith.extui %arg21 : i32 to i64 %46 = arith.extui %arg22 : i32 to i64 %c32_i64_8 = arith.constant 32 : i64 %47 = arith.shli %46, %c32_i64_8 : i64 %48 = arith.ori %45, %47 : i64 %49 = arith.index_castui %48 : i64 to index %50 = arith.extui %arg23 : i32 to i64 %51 = arith.extui %arg24 : i32 to i64 %c32_i64_9 = arith.constant 32 : i64 %52 = arith.shli %51, %c32_i64_9 : i64 %53 = arith.ori %50, %52 : i64 %54 = arith.index_castui %53 : i64 to index %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %55 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor> %56 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor> %57 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor>{%39, %44, %49, %54} %58 = flow.dispatch.workload.ordinal %19, 0 : index %59 = flow.dispatch.workload.ordinal %24, 1 : index %60 = flow.dispatch.workload.ordinal %29, 2 : index %61 = flow.dispatch.workload.ordinal %34, 3 : index %62 = flow.dispatch.tensor.load %57, offsets = [0, 0, 0, 0], sizes = [%58, %59, %60, %61], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%39, %44, %49, %54} -> tensor %63 = flow.dispatch.tensor.load %55, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %64 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %62 : tensor to tensor<2x6x11x13xf32> %65 = linalg.fill ins(%cst : f32) outs(%64 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %66 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %63 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%65 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %66, %56, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13_i32 = arith.constant 13 : i32 %c11_i32 = arith.constant 11 : i32 %c6_i32 = arith.constant 6 : i32 %c2_i32 = arith.constant 2 : i32 %c32_i64 = arith.constant 32 : i64 %c0_i32 = arith.constant 0 : i32 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : 
!hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c2_i32, %c0_i32, %c6_i32, %c0_i32, %c11_i32, %c0_i32, %c13_i32, %c0_i32, %11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, 
%arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32, %arg12: i32, %arg13: i32, %arg14: i32, %arg15: i32, %arg16: i32, %arg17: i32, %arg18: i32, %arg19: i32, %arg20: i32, %arg21: i32, %arg22: i32, %arg23: i32, %arg24: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 {stream.alignment = 2 : index, stream.values = [2 : index]} : i64 to index %20 = arith.extui %arg11 : i32 to i64 %21 = arith.extui %arg12 : i32 to i64 %22 = arith.shli %21, %c32_i64 : i64 %23 = arith.ori %20, %22 : i64 %24 = arith.index_castui %23 {stream.alignment = 2 : index, stream.values = [6 : index]} : i64 to index %25 = arith.extui %arg13 : i32 to i64 %26 = arith.extui %arg14 : i32 to i64 %27 = arith.shli %26, %c32_i64 : i64 %28 = arith.ori %25, %27 : i64 %29 = arith.index_castui %28 {stream.values = [11 : index]} : i64 to index %30 = arith.extui %arg15 : i32 to i64 %31 = arith.extui %arg16 : i32 to i64 %32 = arith.shli %31, %c32_i64 : i64 %33 = arith.ori %30, %32 : i64 %34 = arith.index_castui %33 {stream.values = [13 : index]} : i64 to index %35 = arith.extui %arg17 : i32 to i64 %36 = arith.extui %arg18 : i32 to i64 %37 = arith.shli %36, %c32_i64 : i64 %38 = arith.ori %35, %37 : i64 %39 = arith.index_castui %38 : i64 to index %40 = arith.extui %arg19 : i32 to i64 %41 = arith.extui %arg20 : i32 to i64 %42 = arith.shli %41, %c32_i64 : i64 %43 = arith.ori %40, %42 : i64 %44 = arith.index_castui %43 : i64 to index %45 = arith.extui %arg21 : i32 to i64 %46 = arith.extui %arg22 : i32 to i64 %47 = arith.shli %46, %c32_i64 : i64 %48 = arith.ori %45, %47 : i64 %49 = arith.index_castui %48 : i64 to index %50 = arith.extui %arg23 : i32 to i64 %51 = arith.extui %arg24 : i32 to i64 %52 = arith.shli %51, %c32_i64 : i64 %53 = arith.ori %50, %52 : i64 %54 = arith.index_castui %53 : i64 to index %55 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor> %56 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor> %57 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor>{%39, %44, %49, %54} %58 = flow.dispatch.workload.ordinal %19, 0 : index %59 = flow.dispatch.workload.ordinal %24, 1 : index %60 = flow.dispatch.workload.ordinal %29, 2 : index %61 = flow.dispatch.workload.ordinal 
%34, 3 : index %62 = flow.dispatch.tensor.load %57, offsets = [0, 0, 0, 0], sizes = [%58, %59, %60, %61], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%39, %44, %49, %54} -> tensor %63 = flow.dispatch.tensor.load %55, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %64 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %62 : tensor to tensor<2x6x11x13xf32> %65 = linalg.fill ins(%cst : f32) outs(%64 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %66 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %63 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%65 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %66, %56, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13_i32 = arith.constant 13 : i32 %c11_i32 = arith.constant 11 : i32 %c6_i32 = arith.constant 6 : i32 %c2_i32 = arith.constant 2 : i32 %c32_i64 = arith.constant 32 : i64 %c0_i32 = arith.constant 0 : i32 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) 
await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c2_i32, %c0_i32, %c6_i32, %c0_i32, %c11_i32, %c0_i32, %c13_i32, %c0_i32, %11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } }
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // (unchanged; verbatim duplicate of the After-ApplyPatterns dump above, elided)
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // (unchanged; the only global, the immutable @__device_0, offers nothing to fold)
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // (unchanged; verbatim duplicate elided)
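// [Editor's note: a minimal sketch of the 32-bit push-constant ABI these dumps keep repeating, assuming only the upstream arith/func dialects; the function name is hypothetical. On the host side (@main) each dynamic index is widened to i64 and split into two i32 operands: arith.trunci yields the low word and arith.shrui by 32 followed by trunci yields the high word (e.g. %10 through %13 above). Inside the dispatch the pair is rebuilt with extui/shli/ori (e.g. %0 through %4 of the dispatch function). The full round trip, parseable with `mlir-opt`:]
func.func @split_and_rejoin(%dim: index) -> index {
  %c32 = arith.constant 32 : i64
  // Host side: split one index into lo/hi i32 words.
  %full = arith.index_castui %dim : index to i64
  %lo = arith.trunci %full : i64 to i32
  %hi_shift = arith.shrui %full, %c32 : i64
  %hi = arith.trunci %hi_shift : i64 to i32
  // Device side: reassemble the original 64-bit value from the two words.
  %lo_wide = arith.extui %lo : i32 to i64
  %hi_wide = arith.extui %hi : i32 to i64
  %hi_pos = arith.shli %hi_wide, %c32 : i64
  %joined = arith.ori %lo_wide, %hi_pos : i64
  %out = arith.index_castui %joined : i64 to index
  return %out : index
}
// [End of editor's sketch.]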
!hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c2_i32, %c0_i32, %c6_i32, %c0_i32, %c11_i32, %c0_i32, %c13_i32, %c0_i32, %11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public 
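A note on the dispatch ABI visible above: every index operand crosses into the executable as a pair of i32 push constants. The host splits each 64-bit value into low/high halves (%11/%13, %15/%17, and so on), and the dispatch function reassembles them with the extui/shli/ori chains before casting back to index. A minimal standalone sketch of that round trip (the function names are illustrative, not part of the dump; the bodies mirror the idiom and should parse with mlir-opt):

  func.func @pack_index(%dim: index) -> (i32, i32) {
    %c32 = arith.constant 32 : i64
    %v = arith.index_castui %dim : index to i64
    %lo = arith.trunci %v : i64 to i32   // low 32 bits
    %s = arith.shrui %v, %c32 : i64
    %hi = arith.trunci %s : i64 to i32   // high 32 bits
    return %lo, %hi : i32, i32
  }
  func.func @unpack_index(%lo: i32, %hi: i32) -> index {
    %c32 = arith.constant 32 : i64
    %l = arith.extui %lo : i32 to i64
    %h = arith.extui %hi : i32 to i64
    %sh = arith.shli %h, %c32 : i64
    %v = arith.ori %l, %sh : i64         // (hi << 32) | lo
    %dim = arith.index_castui %v : i64 to index
    return %dim : index
  }

The {stream.values = [...]} annotations on the index_castui results record the only values each cast is known to take; that bookkeeping is what lets FoldUniformOperandsPass further down rewrite those operands as constants.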
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
// (identical to the IR Dump Before FuseGlobals module above)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (module unchanged: @main is the only function, so there is nothing interprocedural to optimize)
// -----// IR Dump Before FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- //
// (identical to the IR Dump Before FuseGlobals module above)
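Before reading the folded dump below, it helps to decode the three resource sizes that recur in every @main above. They are plain row-major byte counts at 4 bytes per f32 element:

  %c2400 = 4 * 6 * 5 * 5 * 4 = 2400   (weights, tensor<4x6x5x5xf32>)
  %c2016 = 2 * 4 * 7 * 9 * 4 = 2016   (result, tensor<2x4x7x9xf32>)
  %7     = d0 * d1 * d2 * d3 * 4      (dynamic input; the arith.muli chain over the four hal.buffer_view.dim values)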
// -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) {
        %c0_i32 = arith.constant 0 : i32
        %c2_i32 = arith.constant 2 : i32
        %c6_i32 = arith.constant 6 : i32
        %c11_i32 = arith.constant 11 : i32
        %c13_i32 = arith.constant 13 : i32
        %cst = arith.constant 0.000000e+00 : f32
        %c32_i64 = arith.constant 32 : i64
        %0 = arith.extui %c0_i32 : i32 to i64
        %1 = arith.extui %c0_i32 : i32 to i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
        %5 = arith.extui %c0_i32 : i32 to i64
        %6 = arith.extui %c0_i32 : i32 to i64
        %7 = arith.shli %6, %c32_i64 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
        %10 = arith.extui %c0_i32 : i32 to i64
        %11 = arith.extui %c0_i32 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
        %15 = arith.extui %c2_i32 : i32 to i64
        %16 = arith.extui %c0_i32 : i32 to i64
        %17 = arith.shli %16, %c32_i64 : i64
        %18 = arith.ori %15, %17 : i64
        %19 = arith.index_castui %18 {stream.alignment = 2 : index, stream.values = [2 : index]} : i64 to index
        %20 = arith.extui %c6_i32 : i32 to i64
        %21 = arith.extui %c0_i32 : i32 to i64
        %22 = arith.shli %21, %c32_i64 : i64
        %23 = arith.ori %20, %22 : i64
        %24 = arith.index_castui %23 {stream.alignment = 2 : index, stream.values = [6 : index]} : i64 to index
        %25 = arith.extui %c11_i32 : i32 to i64
        %26 = arith.extui %c0_i32 : i32 to i64
        %27 = arith.shli %26, %c32_i64 : i64
        %28 = arith.ori %25, %27 : i64
        %29 = arith.index_castui %28 {stream.values = [11 : index]} : i64 to index
        %30 = arith.extui %c13_i32 : i32 to i64
        %31 = arith.extui %c0_i32 : i32 to i64
        %32 = arith.shli %31, %c32_i64 : i64
        %33 = arith.ori %30, %32 : i64
        %34 = arith.index_castui %33 {stream.values = [13 : index]} : i64 to index
        %35 = arith.extui %arg3 : i32 to i64
        %36 = arith.extui %arg4 : i32 to i64
        %37 = arith.shli %36, %c32_i64 : i64
        %38 = arith.ori %35, %37 : i64
        %39 = arith.index_castui %38 : i64 to index
        %40 = arith.extui %arg5 : i32 to i64
        %41 = arith.extui %arg6 : i32 to i64
        %42 = arith.shli %41, %c32_i64 : i64
        %43 = arith.ori %40, %42 : i64
        %44 = arith.index_castui %43 : i64 to index
        %45 = arith.extui %arg7 : i32 to i64
        %46 = arith.extui %arg8 : i32 to i64
        %47 = arith.shli %46, %c32_i64 : i64
        %48 = arith.ori %45, %47 : i64
        %49 = arith.index_castui %48 : i64 to index
        %50 = arith.extui %arg9 : i32 to i64
        %51 = arith.extui %arg10 : i32 to i64
        %52 = arith.shli %51, %c32_i64 : i64
        %53 = arith.ori %50, %52 : i64
        %54 = arith.index_castui %53 : i64 to index
        %55 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %56 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %57 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%39, %44, %49, %54}
        %58 = flow.dispatch.workload.ordinal %19, 0 : index
        %59 = flow.dispatch.workload.ordinal %24, 1 : index
        %60 = flow.dispatch.workload.ordinal %29, 2 : index
        %61 = flow.dispatch.workload.ordinal %34, 3 : index
        %62 = flow.dispatch.tensor.load %57, offsets = [0, 0, 0, 0], sizes = [%58, %59, %60, %61], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%39, %44, %49, %54} -> tensor<?x?x?x?xf32>
        %63 = flow.dispatch.tensor.load %55, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %64 = tensor.empty() : tensor<2x4x7x9xf32>
        %cast = tensor.cast %62 : tensor<?x?x?x?xf32> to tensor<2x6x11x13xf32>
        %65 = linalg.fill ins(%cst : f32) outs(%64 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %66 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %63 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%65 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %66, %56, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13_i32 = arith.constant 13 : i32
    %c11_i32 = arith.constant 11 : i32
    %c6_i32 = arith.constant 6 : i32
    %c2_i32 = arith.constant 2 : i32
    %c32_i64 = arith.constant 32 : i64
    %c0_i32 = arith.constant 0 : i32
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %10 = arith.index_castui %0 : index to i64
    %11 = arith.trunci %10 : i64 to i32
    %12 = arith.shrui %10, %c32_i64 : i64
    %13 = arith.trunci %12 : i64 to i32
    %14 = arith.index_castui %1 : index to i64
    %15 = arith.trunci %14 : i64 to i32
    %16 = arith.shrui %14, %c32_i64 : i64
    %17 = arith.trunci %16 : i64 to i32
    %18 = arith.index_castui %2 : index to i64
    %19 = arith.trunci %18 : i64 to i32
    %20 = arith.shrui %18, %c32_i64 : i64
    %21 = arith.trunci %20 : i64 to i32
    %22 = arith.index_castui %3 : index to i64
    %23 = arith.trunci %22 : i64 to i32
    %24 = arith.shrui %22, %c32_i64 : i64
    %25 = arith.trunci %24 : i64 to i32
    %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
        ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
    %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %28 : !hal.buffer_view
  }
}
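This fold is the whole point of the pass: at the only dispatch site, the first fourteen operands were always the same i32 constants (0, 0, 0, 0, 0, 0, 2, 0, 6, 0, 11, 0, 13, 0: the binding offsets plus the lo/hi halves of the workload dims), so they were materialized inside the executable and the push-constant list shrank from 22 entries to the 8 genuinely dynamic ones. A schematic equivalent in the func dialect (hypothetical names; the real pass rewrites stream.cmd.dispatch sites, not calls):

  // Every site passes the same constant for %uniform...
  func.func private @callee(%uniform: i32, %dynamic: i32) -> i32 {
    %0 = arith.addi %uniform, %dynamic : i32
    return %0 : i32
  }
  // ...so the operand is dropped and the constant materialized in the callee.
  func.func private @callee_folded(%dynamic: i32) -> i32 {
    %c2_i32 = arith.constant 2 : i32   // the formerly uniform operand
    %0 = arith.addi %c2_i32, %dynamic : i32
    return %0 : i32
  }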
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c13_i32 = arith.constant 13 : i32
  %c11_i32 = arith.constant 11 : i32
  %c6_i32 = arith.constant 6 : i32
  %c2_i32 = arith.constant 2 : i32
  %c32_i64 = arith.constant 32 : i64
  %c0_i32 = arith.constant 0 : i32
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
  %10 = arith.index_castui %0 : index to i64
  %11 = arith.trunci %10 : i64 to i32
  %12 = arith.shrui %10, %c32_i64 : i64
  %13 = arith.trunci %12 : i64 to i32
  %14 = arith.index_castui %1 : index to i64
  %15 = arith.trunci %14 : i64 to i32
  %16 = arith.shrui %14, %c32_i64 : i64
  %17 = arith.trunci %16 : i64 to i32
  %18 = arith.index_castui %2 : index to i64
  %19 = arith.trunci %18 : i64 to i32
  %20 = arith.shrui %18, %c32_i64 : i64
  %21 = arith.trunci %20 : i64 to i32
  %22 = arith.index_castui %3 : index to i64
  %23 = arith.trunci %22 : i64 to i32
  %24 = arith.shrui %22, %c32_i64 : i64
  %25 = arith.trunci %24 : i64 to i32
  %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
    stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
      ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
      ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
      wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
    }
  } => !stream.timepoint
  %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
  %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
  util.return %28 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c32_i64 = arith.constant 32 : i64
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
  %10 = arith.index_castui %0 : index to i64
  %11 = arith.trunci %10 : i64 to i32
  %12 = arith.shrui %10, %c32_i64 : i64
  %13 = arith.trunci %12 : i64 to i32
  %14 = arith.index_castui %1 : index to i64
  %15 = arith.trunci %14 : i64 to i32
  %16 = arith.shrui %14, %c32_i64 : i64
  %17 = arith.trunci %16 : i64 to i32
  %18 = arith.index_castui %2 : index to i64
  %19 = arith.trunci %18 : i64 to i32
  %20 = arith.shrui %18, %c32_i64 : i64
  %21 = arith.trunci %20 : i64 to i32
  %22 = arith.index_castui %3 : index to i64
  %23 = arith.trunci %22 : i64 to i32
  %24 = arith.shrui %22, %c32_i64 : i64
  %25 = arith.trunci %24 : i64 to i32
  %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
    stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
      ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
      ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
      wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
    }
  } => !stream.timepoint
  %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
  %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
  util.return %28 : !hal.buffer_view
}
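The canonicalizer's only visible work at this point is dead-code elimination: once FoldUniformOperandsPass dropped the fourteen constant operands from the dispatch, the i32 constants that fed them (%c0_i32, %c2_i32, %c6_i32, %c11_i32, %c13_i32) lost their last uses, and the dump above no longer contains them. The same effect in miniature (illustrative only):

  func.func @dce_example(%x: i32) -> i32 {
    %dead = arith.constant 13 : i32   // no uses: canonicalize erases it
    %0 = arith.addi %x, %x : i32
    return %0 : i32
  }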
// -----// IR Dump Before CSE (cse) //----- //
// (identical to the IR Dump After Canonicalizer function above)
// -----// IR Dump After CSE (cse) //----- //
// (function unchanged: no redundant subexpressions remain for CSE to merge)
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (identical to the IR Dump After Canonicalizer function above)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (function unchanged: @main performs no global loads or stores to simplify)
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local =
#hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %c0_i32 = arith.constant 0 : i32 %c2_i32 = arith.constant 2 : i32 %c6_i32 = arith.constant 6 : i32 %c11_i32 = arith.constant 11 : i32 %c13_i32 = arith.constant 13 : i32 %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %0 = arith.extui %c0_i32 : i32 to i64 %1 = arith.extui %c0_i32 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index %5 = arith.extui %c0_i32 : i32 to i64 %6 = arith.extui %c0_i32 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index %10 = arith.extui %c0_i32 : i32 to i64 %11 = arith.extui %c0_i32 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index %15 = arith.extui %c2_i32 : i32 to i64 %16 = arith.extui %c0_i32 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 {stream.alignment = 2 : index, stream.values = [2 : index]} : i64 to index %20 = arith.extui %c6_i32 : i32 to i64 %21 = arith.extui %c0_i32 : i32 to i64 %22 = arith.shli %21, %c32_i64 : i64 %23 = arith.ori %20, %22 : i64 %24 = arith.index_castui %23 {stream.alignment = 2 : index, stream.values = [6 : index]} : i64 to index %25 = arith.extui %c11_i32 : i32 to i64 %26 = arith.extui %c0_i32 : i32 to i64 %27 = arith.shli %26, %c32_i64 : i64 %28 = arith.ori %25, %27 : i64 %29 = arith.index_castui %28 {stream.values = [11 : index]} : i64 to index %30 = arith.extui %c13_i32 : i32 to i64 %31 = arith.extui %c0_i32 : i32 to i64 %32 = arith.shli %31, %c32_i64 : i64 %33 = arith.ori %30, %32 : i64 %34 = arith.index_castui %33 {stream.values = [13 : index]} : i64 to index %35 = arith.extui %arg3 : i32 to i64 %36 = arith.extui %arg4 : i32 to i64 %37 = arith.shli %36, %c32_i64 : i64 %38 = arith.ori %35, %37 : i64 %39 = arith.index_castui %38 : i64 to index %40 = arith.extui %arg5 : i32 to i64 %41 = arith.extui %arg6 : i32 to i64 %42 = arith.shli %41, %c32_i64 : i64 %43 = arith.ori %40, %42 : i64 %44 = arith.index_castui %43 : i64 to index %45 = arith.extui %arg7 : i32 to i64 %46 = arith.extui %arg8 : i32 to i64 %47 = arith.shli %46, %c32_i64 : i64 %48 = arith.ori %45, %47 : i64 %49 = arith.index_castui %48 : i64 to index %50 = arith.extui %arg9 : i32 to i64 %51 = arith.extui %arg10 : i32 to i64 %52 = arith.shli %51, %c32_i64 : i64 %53 = arith.ori %50, %52 : i64 %54 = arith.index_castui %53 : i64 to index %55 = stream.binding.subspan %arg1[%9] : 
!stream.binding -> !flow.dispatch.tensor> %56 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor> %57 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor>{%39, %44, %49, %54} %58 = flow.dispatch.workload.ordinal %19, 0 : index %59 = flow.dispatch.workload.ordinal %24, 1 : index %60 = flow.dispatch.workload.ordinal %29, 2 : index %61 = flow.dispatch.workload.ordinal %34, 3 : index %62 = flow.dispatch.tensor.load %57, offsets = [0, 0, 0, 0], sizes = [%58, %59, %60, %61], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%39, %44, %49, %54} -> tensor %63 = flow.dispatch.tensor.load %55, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %64 = tensor.empty() : tensor<2x4x7x9xf32> %cast = tensor.cast %62 : tensor to tensor<2x6x11x13xf32> %65 = linalg.fill ins(%cst : f32) outs(%64 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %66 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%cast, %63 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%65 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %66, %56, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci 
%20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 
1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 
for %7] : !stream.resource<external>{%7},
      ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
      wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
    }
  } => !stream.timepoint
  %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
  %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
  util.return %28 : !hal.buffer_view
  }
}
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) {
        %cst = arith.constant 0.000000e+00 : f32
        %c32_i64 = arith.constant 32 : i64
        %c0 = arith.constant 0 : index
        %0 = arith.extui %arg3 : i32 to i64
        %1 = arith.extui %arg4 : i32 to i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 : i64 to index
        %5 = arith.extui %arg5 : i32 to i64
        %6 = arith.extui %arg6 : i32 to i64
        %7 = arith.shli %6, %c32_i64 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 : i64 to index
        %10 = arith.extui %arg7 : i32 to i64
        %11 = arith.extui %arg8 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 : i64 to index
        %15 = arith.extui %arg9 : i32 to i64
        %16 = arith.extui %arg10 : i32 to i64
        %17 = arith.shli %16, %c32_i64 : i64
        %18 = arith.ori %15, %17 : i64
        %19 = arith.index_castui %18 : i64 to index
        %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19}
        %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32>
        %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %25 = tensor.empty() : tensor<2x4x7x9xf32>
        %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
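        // Reader note: the eight i32 operands handed to this dispatch are the
        // lo/hi 32-bit halves of the four dynamic dims of %input0. The host
        // side in @main produces them as lo = trunci(index_castui(dim)) and
        // hi = trunci(shrui(index_castui(dim), 32)); the extui/shli/ori/
        // index_castui chain at the top of this function inverts that, i.e.
        // dim = index_castui((hi << 32) | lo). Worked example, assuming the
        // batch dim is 2: lo = 2, hi = 0, so (0 << 32) | 2 == 2 as expected.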
flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", 
"embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = 
arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public 
@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : 
!hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: 
i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : 
!hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before IPO (iree-util-ipo) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 
: i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = 
arith.shrui %18, %c32_i64 : i64
    %21 = arith.trunci %20 : i64 to i32
    %22 = arith.index_castui %3 : index to i64
    %23 = arith.trunci %22 : i64 to i32
    %24 = arith.shrui %22, %c32_i64 : i64
    %25 = arith.trunci %24 : i64 to i32
    %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
        ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
    %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %28 : !hal.buffer_view
  }
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) {
        %cst = arith.constant 0.000000e+00 : f32
        %c32_i64 = arith.constant 32 : i64
        %c0 = arith.constant 0 : index
        %0 = arith.extui %arg3 : i32 to i64
        %1 = arith.extui %arg4 : i32 to i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 : i64 to index
        %5 = arith.extui %arg5 : i32 to i64
        %6 = arith.extui %arg6 : i32 to i64
        %7 = arith.shli %6, %c32_i64 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 : i64 to index
        %10 = arith.extui %arg7 : i32 to i64
        %11 = arith.extui %arg8 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 : i64 to index
        %15 = arith.extui %arg9 : i32 to i64
        %16 = arith.extui %arg10 : i32 to i64
        %17 = arith.shli %16, %c32_i64 : i64
        %18 = arith.ori %15, %17 : i64
        %19 = arith.index_castui %18 : i64 to index
        %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19}
        %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32>
        %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %25 = tensor.empty() : tensor<2x4x7x9xf32>
        %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c32_i64 = arith.constant 32 : i64
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %10 = arith.index_castui %0 : index to i64
    %11 = arith.trunci %10 : i64 to i32
    %12 = arith.shrui %10, %c32_i64 : i64
    %13 = arith.trunci %12 : i64 to i32
    %14 = arith.index_castui %1 : index to i64
    %15 = arith.trunci %14 : i64 to i32
    %16 = arith.shrui %14, %c32_i64 : i64
    %17 = arith.trunci %16 : i64 to i32
    %18 = arith.index_castui %2 : index to i64
    %19 = arith.trunci %18 : i64 to i32
    %20 = arith.shrui %18, %c32_i64 : i64
    %21 = arith.trunci %20 : i64 to i32
    %22 = arith.index_castui %3 : index to i64
    %23 = arith.trunci %22 : i64 to i32
    %24 = arith.shrui %22, %c32_i64 : i64
    %25 = arith.trunci %24 : i64 to i32
    %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
        ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
    %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %28 : !hal.buffer_view
  }
}
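// NOTE (editorial sketch): each dynamic dimension of %input0 crosses the
// host/device boundary as a pair of i32 push constants. The host-side @main
// splits the 64-bit index into low/high halves (index_castui + trunci +
// shrui), and the dispatch function recombines them (extui + shli + ori +
// index_castui). A minimal round-trip check of that packing, in plain Python
// (illustrative only; the function names are not part of the IR):
//
//   def split_index(dim):
//       v = dim & ((1 << 64) - 1)        # index -> i64 (arith.index_castui)
//       return v & 0xFFFFFFFF, v >> 32   # lo (trunci), hi (shrui + trunci)
//
//   def join_halves(lo, hi):
//       return (hi << 32) | lo           # extui + shli + ori
//
//   for dim in (2, 6, 11, 13):
//       assert join_halves(*split_index(dim)) == dim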
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- //
// (module identical to the IR Dump After IPO above)
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
// (module unchanged)
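// NOTE (editorial sketch): the static sizes in @main follow directly from the
// conv shapes: the 4x6x5x5 f32 filter is 4*6*5*5*4 = 2400 bytes (%c2400), the
// 2x4x7x9 f32 result is 2*4*7*9*4 = 2016 bytes (%c2016), and with stride 1,
// dilation 1 and no padding the output spatial dims obey out = in - k + 1,
// i.e. 7 = 11 - 5 + 1 and 9 = 13 - 5 + 1. Quick check (illustrative Python):
//
//   from math import prod
//   assert prod([4, 6, 5, 5]) * 4 == 2400   # filter bytes (%c2400)
//   assert prod([2, 4, 7, 9]) * 4 == 2016   # result bytes (%c2016)
//   assert 11 - 5 + 1 == 7 and 13 - 5 + 1 == 9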
// -----// IR Dump Before AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
// (module unchanged)
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
// (module unchanged)
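// NOTE (editorial sketch): hal.buffer_view.assert only pins down what the ABI
// declares: for %input0 the rank, f32 element type, and dense-row-major
// encoding (the four dims are read back via hal.buffer_view.dim), and for
// %input1 the full static 4x6x5x5 shape. The equivalent host-side check,
// sketched in Python (illustrative only; not the runtime API):
//
//   def assert_shape(actual, expected):   # expected: int or None per dim
//       assert len(actual) == len(expected), "rank mismatch"
//       for got, want in zip(actual, expected):
//           assert want is None or got == want, "shape mismatch"
//
//   assert_shape((2, 6, 11, 13), (None, None, None, None))  # input0: dynamic
//   assert_shape((4, 6, 5, 5), (4, 6, 5, 5))                # input1: static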
i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan 
%arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch 
@main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = 
linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> 
!hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func 
@main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", 
[#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) {
        %cst = arith.constant 0.000000e+00 : f32
        %c32_i64 = arith.constant 32 : i64
        %c0 = arith.constant 0 : index
        %0 = arith.extui %arg3 : i32 to i64
        %1 = arith.extui %arg4 : i32 to i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 : i64 to index
        %5 = arith.extui %arg5 : i32 to i64
        %6 = arith.extui %arg6 : i32 to i64
        %7 = arith.shli %6, %c32_i64 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 : i64 to index
        %10 = arith.extui %arg7 : i32 to i64
        %11 = arith.extui %arg8 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 : i64 to index
        %15 = arith.extui %arg9 : i32 to i64
        %16 = arith.extui %arg10 : i32 to i64
        %17 = arith.shli %16, %c32_i64 : i64
        %18 = arith.ori %15, %17 : i64
        %19 = arith.index_castui %18 : i64 to index
        %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19}
        %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32>
        %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %25 = tensor.empty() : tensor<2x4x7x9xf32>
        %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c32_i64 = arith.constant 32 : i64
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %10 = arith.index_castui %0 : index to i64
    %11 = arith.trunci %10 : i64 to i32
    %12 = arith.shrui %10, %c32_i64 : i64
    %13 = arith.trunci %12 : i64 to i32
    %14 = arith.index_castui %1 : index to i64
    %15 = arith.trunci %14 : i64 to i32
    %16 = arith.shrui %14, %c32_i64 : i64
    %17 = arith.trunci %16 : i64 to i32
    %18 = arith.index_castui %2 : index to i64
    %19 = arith.trunci %18 : i64 to i32
    %20 = arith.shrui %18, %c32_i64 : i64
    %21 = arith.trunci %20 : i64 to i32
    %22 = arith.index_castui %3 : index to i64
    %23 = arith.trunci %22 : i64 to i32
    %24 = arith.shrui %22, %c32_i64 : i64
    %25 = arith.trunci %24 : i64 to i32
    %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
        ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
    %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %28 : !hal.buffer_view
  }
}

// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
// (module unchanged by the pass; dump identical to the IR shown above)
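A note on the dump above: the host-side @main splits every dynamic dimension of %input0 into a pair of i32 push constants (arith.trunci for the low half, arith.shrui plus arith.trunci for the high half), and the dispatch function reassembles each index with arith.extui / arith.shli / arith.ori before binding the dynamically shaped operand. A minimal C sketch of the same round trip, assuming 64-bit index values; split_index and join_index are illustrative names, not IREE APIs:

#include <assert.h>
#include <stdint.h>

/* Host side: split a 64-bit index into two i32 push constants
   (lo = trunci(x), hi = trunci(shrui(x, 32))). */
static void split_index(uint64_t dim, uint32_t *lo, uint32_t *hi) {
  *lo = (uint32_t)dim;
  *hi = (uint32_t)(dim >> 32);
}

/* Device side: rebuild the index (extui, shli 32, ori). */
static uint64_t join_index(uint32_t lo, uint32_t hi) {
  return (uint64_t)lo | ((uint64_t)hi << 32);
}

int main(void) {
  uint32_t lo, hi;
  split_index(11, &lo, &hi); /* e.g. the dynamic H dimension */
  assert(join_index(lo, hi) == 11);
  return 0;
}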
// -----// IR Dump Before VerifyDevicesPass (iree-hal-verify-devices) //----- //
// (module unchanged; dump identical to the IR shown above)
i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index 
to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = 
arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = 
stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> 
tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => 
%result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> 
!hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = 
#hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After Canonicalizer (canonicalize) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module 
{ func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : 
!hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = 
arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 
%13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> 
!flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: 
!stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
        ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
    %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %28 : !hal.buffer_view
  }
}

// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c32_i64 = arith.constant 32 : i64
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
  %10 = arith.index_castui %0 : index to i64
  %11 = arith.trunci %10 : i64 to i32
  %12 = arith.shrui %10, %c32_i64 : i64
  %13 = arith.trunci %12 : i64 to i32
  %14 = arith.index_castui %1 : index to i64
  %15 = arith.trunci %14 : i64 to i32
  %16 = arith.shrui %14, %c32_i64 : i64
  %17 = arith.trunci %16 : i64 to i32
  %18 = arith.index_castui %2 : index to i64
  %19 = arith.trunci %18 : i64 to i32
  %20 = arith.shrui %18, %c32_i64 : i64
  %21 = arith.trunci %20 : i64 to i32
  %22 = arith.index_castui %3 : index to i64
  %23 = arith.trunci %22 : i64 to i32
  %24 = arith.shrui %22, %c32_i64 : i64
  %25 = arith.trunci %24 : i64 to i32
  %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
    stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
      ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
      ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
      wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
    }
  } => !stream.timepoint
  %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
  %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
  util.return %28 : !hal.buffer_view
}

// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (function identical to the "IR Dump Before SimplifyGlobalAccesses" above; the pass made no changes)
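
The eight i32 operands fed to the dispatch above are the four dynamic input dimensions, each widened to i64 (arith.index_castui) and split into low/high 32-bit words (the arith.trunci / arith.shrui pairs at %10..%25); the dispatch function reverses this with arith.extui / arith.shli / arith.ori. A minimal C sketch of that round trip, with illustrative function names that are not IREE APIs:

#include <assert.h>
#include <stdint.h>

/* Host side: mirrors arith.index_castui + arith.trunci / arith.shrui. */
static void pack_index(uint64_t dim, uint32_t *lo, uint32_t *hi) {
  *lo = (uint32_t)dim;         /* arith.trunci : i64 to i32            */
  *hi = (uint32_t)(dim >> 32); /* arith.shrui by 32, then arith.trunci */
}

/* Device side: mirrors arith.extui / arith.shli / arith.ori. */
static uint64_t unpack_index(uint32_t lo, uint32_t hi) {
  return (uint64_t)lo | ((uint64_t)hi << 32);
}

int main(void) {
  /* The dims asserted in this trace; any 64-bit value round-trips. */
  uint64_t dims[4] = {2, 6, 11, 13};
  for (int i = 0; i < 4; ++i) {
    uint32_t lo, hi;
    pack_index(dims[i], &lo, &hi);
    assert(unpack_index(lo, hi) == dims[i]);
  }
  return 0;
}
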
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @main_dispatch_0 {
    stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) {
        %cst = arith.constant 0.000000e+00 : f32
        %c32_i64 = arith.constant 32 : i64
        %c0 = arith.constant 0 : index
        %0 = arith.extui %arg3 : i32 to i64
        %1 = arith.extui %arg4 : i32 to i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 : i64 to index
        %5 = arith.extui %arg5 : i32 to i64
        %6 = arith.extui %arg6 : i32 to i64
        %7 = arith.shli %6, %c32_i64 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 : i64 to index
        %10 = arith.extui %arg7 : i32 to i64
        %11 = arith.extui %arg8 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 : i64 to index
        %15 = arith.extui %arg9 : i32 to i64
        %16 = arith.extui %arg10 : i32 to i64
        %17 = arith.shli %16, %c32_i64 : i64
        %18 = arith.ori %15, %17 : i64
        %19 = arith.index_castui %18 : i64 to index
        %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
        %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19}
        %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32>
        %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
        %25 = tensor.empty() : tensor<2x4x7x9xf32>
        %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
        flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
        return
      }
    }
  }
  util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c32_i64 = arith.constant 32 : i64
    %c2016 = arith.constant 2016 : index
    %c2400 = arith.constant 2400 : index
    %c0 = arith.constant 0 : index
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c13 = arith.constant 13 : index
    %c11 = arith.constant 11 : index
    %c2 = arith.constant 2 : index
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
    %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
    %4 = arith.muli %0, %c4 : index
    %5 = arith.muli %4, %1 : index
    %6 = arith.muli %5, %2 : index
    %7 = arith.muli %6, %3 : index
    %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?x?x?xf32>{%0, %1, %2, %3} in !stream.resource<external>{%7}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
    %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource<external>{%c2400}
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c2016} => !stream.timepoint
    %10 = arith.index_castui %0 : index to i64
    %11 = arith.trunci %10 : i64 to i32
    %12 = arith.shrui %10, %c32_i64 : i64
    %13 = arith.trunci %12 : i64 to i32
    %14 = arith.index_castui %1 : index to i64
    %15 = arith.trunci %14 : i64 to i32
    %16 = arith.shrui %14, %c32_i64 : i64
    %17 = arith.trunci %16 : i64 to i32
    %18 = arith.index_castui %2 : index to i64
    %19 = arith.trunci %18 : i64 to i32
    %20 = arith.shrui %18, %c32_i64 : i64
    %21 = arith.trunci %20 : i64 to i32
    %22 = arith.index_castui %3 : index to i64
    %23 = arith.trunci %22 : i64 to i32
    %24 = arith.shrui %22, %c32_i64 : i64
    %25 = arith.trunci %24 : i64 to i32
    %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource<external>{%7}, %9 as %arg3: !stream.resource<external>{%c2400}, %result as %arg4: !stream.resource<external>{%c2016}) {
      stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) {
        ro %arg2[%c0 for %7] : !stream.resource<external>{%7},
        ro %arg3[%c0 for %c2400] : !stream.resource<external>{%c2400},
        wo %arg4[%c0 for %c2016] : !stream.resource<external>{%c2016}
      }
    } => !stream.timepoint
    %27 = stream.timepoint.await %26 => %result : !stream.resource<external>{%c2016}
    %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource<external>{%c2016} -> !hal.buffer_view
    util.return %28 : !hal.buffer_view
  }
}

// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
// (module identical to the "IR Dump Before ApplyPatterns" above; the pass made no changes)
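
For reference, the buffer sizes threaded through these dumps are plain row-major byte counts: the arith.muli chain (%4..%7) computes d0 * 4 * d1 * d2 * d3, where 4 is sizeof(f32), and the static output shape follows from the unit-stride, unit-dilation convolution rule out = in - kernel + 1 (11 - 5 + 1 = 7 and 13 - 5 + 1 = 9, hence tensor<2x4x7x9xf32>). A quick check of the constants that appear in the IR, assuming the asserted 2x6x11x13 input:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the arith.muli chain %4..%7: bytes = (((d0 * 4) * d1) * d2) * d3. */
static uint64_t f32_bytes(uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3) {
  return d0 * 4 * d1 * d2 * d3;
}

int main(void) {
  printf("input   %%7     = %llu\n", (unsigned long long)f32_bytes(2, 6, 11, 13)); /* 6864 */
  printf("weights %%c2400 = %llu\n", (unsigned long long)f32_bytes(4, 6, 5, 5));   /* 2400 */
  printf("output  %%c2016 = %llu\n", (unsigned long long)f32_bytes(2, 4, 7, 9));   /* 2016 */
  return 0;
}
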
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
// (module identical to the "IR Dump Before ApplyPatterns" above)

// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (module identical to the "IR Dump Before ApplyPatterns" above; the pass made no changes)
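
The linalg.conv_2d_nchw_fchw op that every one of these dispatches carries has, with unit strides and dilations, the semantics of the following plain C loop nest (a reference sketch of the op's definition, not IREE's generated code; the zero-initialized accumulator corresponds to the linalg.fill with %cst = 0.0):

enum { N = 2, C = 6, H = 11, W = 13, F = 4, KH = 5, KW = 5, OH = H - KH + 1, OW = W - KW + 1 };

/* out[n][f][oh][ow] = sum over c, kh, kw of in[n][c][oh+kh][ow+kw] * w[f][c][kh][kw] */
static void conv_2d_nchw_fchw(const float in[N][C][H][W],
                              const float w[F][C][KH][KW],
                              float out[N][F][OH][OW]) {
  for (int n = 0; n < N; ++n)
    for (int f = 0; f < F; ++f)
      for (int oh = 0; oh < OH; ++oh)
        for (int ow = 0; ow < OW; ++ow) {
          float acc = 0.0f; /* linalg.fill */
          for (int c = 0; c < C; ++c)
            for (int kh = 0; kh < KH; ++kh)
              for (int kw = 0; kw < KW; ++kw)
                acc += in[n][c][oh + kh][ow + kw] * w[f][c][kh][kw];
          out[n][f][oh][ow] = acc;
        }
}
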
!stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 
%17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = 
arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load 
%20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for 
%c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before VerifyDevicesPass (iree-hal-verify-devices) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : 
tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 
6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { stream.cmd.dispatch @main_dispatch_0::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump Before MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_local stream.executable private @main_dispatch_0 { stream.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 workgroups(%arg0: index, %arg1: 
index, %arg2: index, %arg3: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) { %cst = arith.constant 0.000000e+00 : f32 %c32_i64 = arith.constant 32 : i64 %c0 = arith.constant 0 : index %0 = arith.extui %arg3 : i32 to i64 %1 = arith.extui %arg4 : i32 to i64 %2 = arith.shli %1, %c32_i64 : i64 %3 = arith.ori %0, %2 : i64 %4 = arith.index_castui %3 : i64 to index %5 = arith.extui %arg5 : i32 to i64 %6 = arith.extui %arg6 : i32 to i64 %7 = arith.shli %6, %c32_i64 : i64 %8 = arith.ori %5, %7 : i64 %9 = arith.index_castui %8 : i64 to index %10 = arith.extui %arg7 : i32 to i64 %11 = arith.extui %arg8 : i32 to i64 %12 = arith.shli %11, %c32_i64 : i64 %13 = arith.ori %10, %12 : i64 %14 = arith.index_castui %13 : i64 to index %15 = arith.extui %arg9 : i32 to i64 %16 = arith.extui %arg10 : i32 to i64 %17 = arith.shli %16, %c32_i64 : i64 %18 = arith.ori %15, %17 : i64 %19 = arith.index_castui %18 : i64 to index %20 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor> %21 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor> %22 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor>{%4, %9, %14, %19} %23 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %9, %14, %19} -> tensor<2x6x11x13xf32> %24 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %25 = tensor.empty() : tensor<2x4x7x9xf32> %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %27 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%23, %24 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%26 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %27, %21, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) 
// -----// IR Dump After MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  hal.executable private @main_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
      hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
        %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() {
          %c0 = arith.constant 0 : index
          %c32_i64 = arith.constant 32 : i64
          %cst = arith.constant 0.000000e+00 : f32
          %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
          %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
          %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
          %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
          %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
          %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
          %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
          %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
          %8 = arith.extui %0 : i32 to i64
          %9 = arith.extui %1 : i32 to i64
          %10 = arith.shli %9, %c32_i64 : i64
          %11 = arith.ori %8, %10 : i64
          %12 = arith.index_castui %11 : i64 to index
          %13 = arith.extui %2 : i32 to i64
          %14 = arith.extui %3 : i32 to i64
          %15 = arith.shli %14, %c32_i64 : i64
          %16 = arith.ori %13, %15 : i64
          %17 = arith.index_castui %16 : i64 to index
          %18 = arith.extui %4 : i32 to i64
          %19 = arith.extui %5 : i32 to i64
          %20 = arith.shli %19, %c32_i64 : i64
          %21 = arith.ori %18, %20 : i64
          %22 = arith.index_castui %21 : i64 to index
          %23 = arith.extui %6 : i32 to i64
          %24 = arith.extui %7 : i32 to i64
          %25 = arith.shli %24, %c32_i64 : i64
          %26 = arith.ori %23, %25 : i64
          %27 = arith.index_castui %26 : i64 to index
          %28 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
          %29 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
          %30 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
          %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32>
          %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<4x6x5x5xf32>
          %33 = tensor.empty() : tensor<2x4x7x9xf32>
          %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
          %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
          flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
          return
        }
      }
    }
  }
  // (util.func public @main is as in the preceding dump, except that
  //  stream.cmd.dispatch now routes through the variant:
  //  @main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32)
}
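// Summary of the materialized interface (a reading of the dump above, not
// compiler output): the pipeline layout holds 8 i32 push constants (4 dims x
// 2 halves), and the three dispatch operands map to set 0 as
//   ro %arg2 (dynamic NCHW input)          -> binding(0), "ReadOnly|Indirect"
//   ro %arg3 (4x6x5x5 filter, 2400 bytes)  -> binding(1), "ReadOnly|Indirect"
//   wo %arg4 (2x4x7x9 result, 2016 bytes)  -> binding(2), Indirect
// The static byte sizes check out: 4*6*5*5*4 = 2400 and 2*4*7*9*4 = 2016, and
// the 7x9 output follows from the convolution arithmetic (stride 1, dilation
// 1, no padding): OH = 11 - 5 + 1 = 7, OW = 13 - 5 + 1 = 9.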
// -----// IR Dump Before PruneExecutablesPass (iree-hal-prune-executables) //----- //
// (unchanged from the preceding dump)
// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- //
// (unchanged: the one executable and its export are both referenced by @main, so nothing is pruned)
// -----// IR Dump Before ConfigureExecutablesPass (iree-hal-configure-executables) //----- //
// (the @main_dispatch_0 executable from the preceding dump, unchanged, printed
//  standalone with its target and pipeline layout inlined rather than as
//  attribute aliases)
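// From here the pipeline descends into the executable for per-target codegen.
// Dumps of this shape come from the generic MLIR IR-printing options; a sketch
// of a command that would reproduce them (flag spellings can vary across IREE
// versions):
//   iree-compile conv.mlir -o conv.vmfb \
//     --iree-hal-target-backends=llvm-cpu \
//     --mlir-print-ir-before-all --mlir-print-ir-after-all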
target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = 
Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } } // -----// IR Dump Before ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- // hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = 
arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } // -----// IR Dump Before TypePropagationPass (iree-codegen-type-propagation) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, 
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : 
i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump Before BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, 
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) 
ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump Before BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, 
<1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 
32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> 
return } // -----// IR Dump Before DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, 
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- //
// (No softmax in this dispatch; IR unchanged.)
// -----// IR Dump Before MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
// (IR unchanged; this dump and the next are printed at module scope.)
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
// (No user-provided configuration to materialize; IR unchanged.)
// -----// IR Dump Before RematerializeParallelOpsPass (iree-codegen-rematerialize-parallel-ops) //----- //
// (IR unchanged.)
// -----// IR Dump After RematerializeParallelOpsPass (iree-codegen-rematerialize-parallel-ops) //----- //
// (Nothing to rematerialize; IR unchanged.)
// -----// IR Dump Before ExpandF16OpToF32Pass (iree-llvmcpu-expand-f16-op-to-f32) //----- //
// (IR unchanged.)
// -----// IR Dump After ExpandF16OpToF32Pass (iree-llvmcpu-expand-f16-op-to-f32) //----- //
// (All computation is already f32; IR unchanged.)
// -----// IR Dump Before CPUMaterializeDeviceEncodingPass (iree-codegen-cpu-materialize-device-encoding) //----- //
// (IR unchanged.)
// -----// IR Dump After CPUMaterializeDeviceEncodingPass (iree-codegen-cpu-materialize-device-encoding) //----- //
// (No tensor encodings present; IR unchanged.)
// -----// IR Dump Before EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
// (IR unchanged.)
storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump After EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, 
// -----// IR Dump Before LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- //
// (IR unchanged from the dump above, printed at module scope.)
// -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- //
module {
  func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<…>} {
    // (Body unchanged from the full dump above except for the conv op, which
    // now carries the selected lowering configuration:)
    ...
    %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<…>, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32>
    ...
    return
  }
}
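// The 2x4x7x9 in the dispatch name is plain convolution arithmetic: with unit
// stride and dilation and no padding, each spatial extent is in - kernel + 1,
// so the 11x13 input under a 5x5 filter yields 7x9, with N = 2 and F = 4
// carried through. A quick Python sketch of that arithmetic (illustrative
// only, not part of the compiler):
//
//   def conv_out_size(in_size: int, kernel: int,
//                     stride: int = 1, dilation: int = 1) -> int:
//       # Output extent of an unpadded ("valid") convolution dimension.
//       effective_kernel = dilation * (kernel - 1) + 1
//       return (in_size - effective_kernel) // stride + 1
//
//   # input tensor<2x6x11x13xf32>, filter tensor<4x6x5x5xf32>:
//   assert conv_out_size(11, 5) == 7 and conv_out_size(13, 5) == 9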
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- //
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
  hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
  ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
    hal.return %x, %y, %z : index, index, index
  }
  builtin.module {
    // (Same @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 function as
    // in the dump above.)
  }
}
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } // -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- // hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, 
storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } } // -----// IR Dump Before TranslateExecutablesPass (iree-hal-translate-executables) //----- // hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, 
%arg4 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, 
strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } } // -----// IR Dump Before TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) //----- // hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan 
layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } } // -----// IR Dump Before LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- // module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = 
arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } // -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- // module { func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = 
Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } } // -----// IR Dump Before LLVMCPULowerExecutableTargetPass (iree-llvmcpu-lower-executable-target) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, 
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump Before TileAndDistributeToWorkgroupsPass (iree-codegen-tile-and-distribute-to-workgroups) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = 
hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0, 0], sizes = [2, 6, 11, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x11x13xf32> %32 = flow.dispatch.tensor.load %28, offsets = [0, 0, 0, 0], sizes = [4, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x6x5x5xf32> %33 = tensor.empty() : tensor<2x4x7x9xf32> %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x11x13xf32>, tensor<4x6x5x5xf32>) outs(%34 : tensor<2x4x7x9xf32>) -> tensor<2x4x7x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, 0, 0, 0], sizes = [2, 4, 7, 9], strides = [1, 1, 1, 1] : tensor<2x4x7x9xf32> -> !flow.dispatch.tensor> return } // -----// IR Dump After TileAndDistributeToWorkgroupsPass (iree-codegen-tile-and-distribute-to-workgroups) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c13 = arith.constant 13 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = 
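// Note: each dynamic dimension of the binding-0 input reaches the dispatch as a
// pair of 32-bit push constants (ordinals 0-7 above), and the extui/shli/ori/
// index_castui chains reassemble each pair into one 64-bit index (%12, %17, %22,
// %27). A minimal C sketch of that reassembly -- the helper name is illustrative,
// not part of the dump:
//
//   #include <stdint.h>
//
//   // Rebuild one 64-bit size from its low/high 32-bit push-constant halves,
//   // mirroring the arith.extui + arith.shli + arith.ori sequence above.
//   static inline uint64_t rebuild_size(uint32_t lo, uint32_t hi) {
//     return (uint64_t)lo | ((uint64_t)hi << 32);
//   }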
// -----// IR Dump After TileAndDistributeToWorkgroupsPass (iree-codegen-tile-and-distribute-to-workgroups) //----- //
#pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>
#config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 1, 9, 0, 0, 0]]>
#translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #translation} {
  %c13 = arith.constant 13 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %31 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, %c0], sizes = [2, 6, 5, %c13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x?xf32>
      %32 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
      %33 = tensor.empty() : tensor<2x1x1x9xf32>
      %34 = linalg.fill ins(%cst : f32) outs(%33 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #config, strides = dense<1> : vector<2xi64>} ins(%31, %32 : tensor<2x6x5x?xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      %cast = tensor.cast %35 : tensor<2x1x1x9xf32> to tensor<2x1x1x?xf32>
      flow.dispatch.tensor.store %cast, %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : tensor<2x1x1x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump Before CSE (cse) //----- //
// (func.func unchanged from the previous dump)
// -----// IR Dump After CSE (cse) //----- //
// (func.func unchanged; no duplicated subexpressions for CSE to remove)
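// Note: TileAndDistributeToWorkgroups above split the 2x4x7x9 output into 2x1x1x9
// tiles and distributed the F (4) and OH (7) loops across workgroups: each scf.for
// runs from the workgroup id to the loop bound with the workgroup count as the
// step, a grid-stride pattern. A rough C sketch of the resulting control flow --
// helper names and signatures are illustrative only:
//
//   void compute_conv_tile(int f, int oh);  // computes one 2x1x1x9 output tile
//
//   // Each workgroup strides over the distributed output-channel/output-row loops.
//   void dispatch_workgroup(int wg_id_x, int wg_count_x, int wg_id_y, int wg_count_y) {
//     for (int f = wg_id_y; f < 4; f += wg_count_y)        /* dim 1: output channels */
//       for (int oh = wg_id_x; oh < 7; oh += wg_count_x)   /* dim 2: output rows */
//         compute_conv_tile(f, oh);
//   }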
// -----// IR Dump Before ConvertToDestinationPassingStylePass (iree-codegen-convert-to-destination-passing-style) //----- //
// (func.func unchanged from the previous dump)
// -----// IR Dump After ConvertToDestinationPassingStylePass (iree-codegen-convert-to-destination-passing-style) //----- //
#pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>
#config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 1, 9, 0, 0, 0]]>
#translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #translation} {
  %c13 = arith.constant 13 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x4x7x9xf32>> -> tensor<2x1x1x?xf32>
      %cast = tensor.cast %31 : tensor<2x1x1x?xf32> to tensor<2x1x1x9xf32>
      %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, %c0], sizes = [2, 6, 5, %c13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x?xf32>
      %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
      %34 = linalg.fill ins(%cst : f32) outs(%cast : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x?xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      %cast_0 = tensor.cast %35 : tensor<2x1x1x9xf32> to tensor<2x1x1x?xf32>
      flow.dispatch.tensor.store %cast_0, %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : tensor<2x1x1x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<2x4x7x9xf32>>
    }
  }
  return
}
layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x?xf32> %cast = tensor.cast %31 : tensor<2x1x1x?xf32> to tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, %c0], sizes = [2, 6, 5, %c13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x?xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%cast : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x?xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %cast_0 = tensor.cast %35 : tensor<2x1x1x9xf32> to tensor<2x1x1x?xf32> flow.dispatch.tensor.store %cast_0, %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : tensor<2x1x1x?xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After FoldAffineMinInDistributedLoopsPass (iree-codegen-fold-affinemin-in-distributed-loops) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c13 = arith.constant 13 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) 
ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x?xf32> %cast = tensor.cast %31 : tensor<2x1x1x?xf32> to tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, %c0], sizes = [2, 6, 5, %c13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x?xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%cast : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x?xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %cast_0 = tensor.cast %35 : tensor<2x1x1x9xf32> to tensor<2x1x1x?xf32> flow.dispatch.tensor.store %cast_0, %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : tensor<2x1x1x?xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c13 = arith.constant 13 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = 
arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x?xf32>
      %cast = tensor.cast %31 : tensor<2x1x1x?xf32> to tensor<2x1x1x9xf32>
      %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, %c0], sizes = [2, 6, 5, %c13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x?xf32>
      %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
      %34 = linalg.fill ins(%cst : f32) outs(%cast : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x?xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      %cast_0 = tensor.cast %35 : tensor<2x1x1x9xf32> to tensor<2x1x1x?xf32>
      flow.dispatch.tensor.store %cast_0, %29, offsets = [0, %arg0, %arg1, %c0], sizes = [2, 1, 1, %c9], strides = [1, 1, 1, 1] : tensor<2x1x1x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
      %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32>
      flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
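An aside on the `arith.extui` / `arith.shli` / `arith.ori` / `arith.index_castui` chains that open every dump above: each dynamic dimension of the input tensor crosses the HAL ABI as a pair of 32-bit constants (ordinals (0,1), (2,3), (4,5), (6,7)), which the dispatch zero-extends and recombines into one 64-bit index. A minimal Python sketch of the same reassembly, with names that are illustrative rather than taken from the log:

def decode_index(lo32: int, hi32: int) -> int:
    # zero-extend both 32-bit halves, shift the high half up, then OR them together
    return (lo32 & 0xFFFFFFFF) | ((hi32 & 0xFFFFFFFF) << 32)

assert decode_index(13, 0) == 13        # small extents fit entirely in the low half
assert decode_index(0, 1) == 1 << 32    # the high half carries bits 32..63

The four decoded values (%12, %17, %22, %27 in the IR) then supply the dynamic sizes of the binding(0) dispatch tensor.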
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before CSE (cse) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to 
i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After CSE (cse) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) 
: i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c7 = 
arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = 
flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, 
storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : 
i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = 
hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = linalg.fill ins(%cst : f32) outs(%31 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> %35 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%32, %33 : tensor<2x6x5x13xf32>, tensor<1x6x5x5xf32>) outs(%34 : tensor<2x1x1x9xf32>) -> tensor<2x1x1x9xf32> flow.dispatch.tensor.store %35, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- // func.func 
@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = 
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32>
          %37 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice, %33 : tensor<1x6x5x7xf32>, tensor<1x6x5x5xf32>) outs(%36 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : 
f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice, %33 : tensor<1x6x5x7xf32>, tensor<1x6x5x5xf32>) outs(%36 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, 
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice, %33 : tensor<1x6x5x7xf32>, tensor<1x6x5x5xf32>) outs(%36 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = 
hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : 
f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice, %33 : tensor<1x6x5x7xf32>, tensor<1x6x5x5xf32>) outs(%36 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, 
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice, %33 : tensor<1x6x5x7xf32>, tensor<1x6x5x5xf32>) outs(%36 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before LLVMCPUTileRootAndFuseProducerConsumerPass (iree-llvmcpu-tile-root-and-fuse-producer-consumer) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 
%3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill 
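// Tile-size arithmetic for the loop nest around this point: the workgroup
// loops walk the 4 output channels (%arg0) and 7 output rows (%arg1), while
// the inner scf.for loops walk the batch (%arg2: 0..2 step 1) and output
// width (%arg4: 0..9 step 3). The spatial sizes follow from valid, stride-1,
// dilation-1 convolution: OH = 11 - 5 + 1 = 7 and OW = 13 - 5 + 1 = 9; a
// width-3 output tile needs an input window of width 3 + 5 - 1 = 7, which is
// why the tensor.extract_slice above carves a 1x6x5x7 slab out of the
// 2x6x5x13 input block loaded per workgroup.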
ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice, %33 : tensor<1x6x5x7xf32>, tensor<1x6x5x5xf32>) outs(%36 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After LLVMCPUTileRootAndFuseProducerConsumerPass (iree-llvmcpu-tile-root-and-fuse-producer-consumer) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = 
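// Effect of LLVMCPUTileRootAndFuseProducerConsumerPass, visible further down
// in this dump: two new scf.for loops (%arg6 and %arg8, both 0..5 step 1)
// tile the 5x5 filter window, so the root convolution now reduces a single
// (kh, kw) position per iteration:
//   ins : tensor<1x6x1x3xf32> input slice, tensor<1x6x1x1xf32> filter slice
//   outs: tensor<1x1x1x3xf32> accumulator, carried through iter_args
// The linalg.fill producing the zero-initialized accumulator stays hoisted
// outside the two new reduction loops.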
hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %39 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x6x1x3xf32>, tensor<1x6x1x1xf32>) outs(%arg9 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> scf.yield %39 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = 
arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = 
flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %39 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x6x1x3xf32>, tensor<1x6x1x1xf32>) outs(%arg9 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> scf.yield %39 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, 
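// LLVMCPUTileAndFusePass left the function unchanged (this dump is identical
// to the "Before" dump above), presumably because the tile-root-and-fuse step
// already materialized every tiled loop for this configuration, leaving
// nothing further to tile or fuse.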
Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %39 = 
linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x6x1x3xf32>, tensor<1x6x1x1xf32>) outs(%arg9 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> scf.yield %39 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = 
hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %39 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x6x1x3xf32>, tensor<1x6x1x1xf32>) outs(%arg9 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> scf.yield %39 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = 
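// Effect of DecomposeConvolutionToLowerDimOpsPass, visible below: with the
// tiled convolution's height dims now all unit (1x6x1x3 input, 1x6x1x1
// filter, 1x1x1x3 accumulator), the 2-D op is rewritten as a 1-D one over
// the width dimension:
//   before: linalg.conv_2d_nchw_fchw ins(tensor<1x6x1x3xf32>, tensor<1x6x1x1xf32>)
//                                    outs(tensor<1x1x1x3xf32>)
//   after : linalg.conv_1d_ncw_fcw   ins(tensor<1x6x3xf32>, tensor<1x6x1xf32>)
//                                    outs(tensor<1x1x3xf32>)
// Rank-reducing tensor.extract_slice / tensor.insert_slice pairs drop and
// restore the unit height dimension around the new op.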
arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> 
tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %extracted_slice_5 = tensor.extract_slice %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x3xf32> %39 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x6x3xf32>, tensor<1x6x1xf32>) outs(%extracted_slice_5 : tensor<1x1x3xf32>) -> tensor<1x1x3xf32> %inserted_slice_6 = tensor.insert_slice %39 into %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3xf32> into tensor<1x1x1x3xf32> scf.yield %inserted_slice_6 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = 
hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : 
f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %extracted_slice_5 = tensor.extract_slice %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x3xf32> %39 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x6x3xf32>, tensor<1x6x1xf32>) outs(%extracted_slice_5 : tensor<1x1x3xf32>) -> tensor<1x1x3xf32> %inserted_slice_6 = tensor.insert_slice %39 into %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3xf32> into tensor<1x1x1x3xf32> scf.yield %inserted_slice_6 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, 
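// FuseTensorPadWithConsumerPass made no change to this dispatch (identical
// before/after dumps): the function contains no tensor.pad to fuse, since the
// convolution only ever reads valid, in-bounds windows of its input here.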
storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : 
tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %extracted_slice_5 = tensor.extract_slice %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x3xf32> %39 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x6x3xf32>, tensor<1x6x1xf32>) outs(%extracted_slice_5 : tensor<1x1x3xf32>) -> tensor<1x1x3xf32> %inserted_slice_6 = tensor.insert_slice %39 into %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3xf32> into tensor<1x1x1x3xf32> scf.yield %inserted_slice_6 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 
: i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %extracted_slice_5 = tensor.extract_slice %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x3xf32> %39 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x6x3xf32>, tensor<1x6x1xf32>) outs(%extracted_slice_5 : tensor<1x1x3xf32>) -> tensor<1x1x3xf32> %inserted_slice_6 = tensor.insert_slice %39 into %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : 
tensor<1x1x3xf32> into tensor<1x1x1x3xf32> scf.yield %inserted_slice_6 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan 
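// This second ConcretizePadResultShapePass run also appears to be a no-op
// (the dump matches the "Before" dump above): with no tensor.pad results in
// the function, there are no pad shapes left to concretize.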
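The eight hal.interface.constant.load ordinals carry the four dynamic input sizes, each split into two i32 push constants; the extui/shli/ori/index_castui chains reassemble them into the index values %12, %17, %22 and %27. The same computation in Python (a sketch with an invented name, not IREE code):

def reassemble_size(lo32: int, hi32: int) -> int:
    # Mirrors arith.extui on both halves, arith.shli by %c32_i64,
    # arith.ori, then arith.index_castui to index.
    return (hi32 << 32) | (lo32 & 0xFFFFFFFF)

assert reassemble_size(11, 0) == 11          # small sizes live in the low word
assert reassemble_size(0, 1) == 1 << 32      # the high word carries bits 32..63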
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %32[%arg2, 0, 
0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_0 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x1x1x3xf32>) -> tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) { %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) { %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_2 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %extracted_slice_5 = tensor.extract_slice %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x3xf32> %39 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x6x3xf32>, tensor<1x6x1xf32>) outs(%extracted_slice_5 : tensor<1x1x3xf32>) -> tensor<1x1x3xf32> %inserted_slice_6 = tensor.insert_slice %39 into %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3xf32> into tensor<1x1x1x3xf32> scf.yield %inserted_slice_6 : tensor<1x1x1x3xf32> } scf.yield %38 : tensor<1x1x1x3xf32> } %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before GenericVectorizationPass (iree-codegen-generic-vectorization) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = 
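ConcretizePadResultShapePass and TensorToVectorVectorizePadPass only act on tensor.pad ops, and this dispatch contains none, so the function arrives at GenericVectorizationPass exactly as tiling produced it; the vectorizer below is the first pass in this stretch that changes the IR.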
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
// -----// IR Dump Before OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %31 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %33 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %31) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %32[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32>
          %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %36) -> (tensor<1x1x1x3xf32>) {
            %38 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x1x3xf32>) {
              %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_3 = tensor.extract_slice %33[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %extracted_slice_6 = tensor.extract_slice %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x3xf32>
              %39 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32>
              %40 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x3xf32>, vector<1x1x3xf32>
              %42 = vector.transpose %40, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32>
              %43 = vector.extract %42[0] : vector<6x1xf32> from vector<1x6x1xf32>
              %44 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %43, %41 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32>
              %45 = vector.transfer_write %44, %extracted_slice_6[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>
              %inserted_slice_7 = tensor.insert_slice %45 into %arg9[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3xf32> into tensor<1x1x1x3xf32>
              scf.yield %inserted_slice_7 : tensor<1x1x1x3xf32>
            }
            scf.yield %38 : tensor<1x1x1x3xf32>
          }
          %inserted_slice = tensor.insert_slice %37 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
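GenericVectorizationPass rewrites each width-1 conv_1d tap into vector transfers plus a vector.contract. The contract's indexing maps read: lhs is indexed (d0, d3, d1), rhs (d3, d2), accumulator (d0, d2, d1), with d3 the only reduction dimension, i.e. a batched matvec over the 6 input channels producing the 3 output-width elements. A numpy check of those semantics (einsum letters a..d stand for d0..d3; illustrative only):

import numpy as np

lhs = np.random.rand(1, 6, 3).astype(np.float32)  # input window (n, c, w), indexed (d0, d3, d1)
rhs = np.random.rand(6, 1).astype(np.float32)     # one filter tap (c, f), indexed (d3, d2)
acc = np.random.rand(1, 1, 3).astype(np.float32)  # output tile (n, f, w), indexed (d0, d2, d1)

# out[d0, d2, d1] = acc[d0, d2, d1] + sum over d3 of lhs[d0, d3, d1] * rhs[d3, d2]
res = np.einsum('abc,bd->adc', lhs, rhs) + acc
assert res.shape == (1, 1, 3)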
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// -----// IR Dump Before CSE (cse) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32>
          %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x1x3xf32>, vector<1x1x3xf32>
          %38 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %37) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_3 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32>
              %42 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32>
              %43 = vector.transpose %42, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32>
              %44 = vector.extract %43[0] : vector<6x1xf32> from vector<1x6x1xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %44, %arg9 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32>
              scf.yield %45 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
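OptimizeTensorInsertExtractSlicesPass hoists the loop-invariant filter load %31 out of the %arg1 loop and, more importantly, cancels the per-iteration transfer_write/transfer_read round trip on the 1x1x3 accumulator: the two reduction loops now carry a vector<1x1x3xf32> in their iter_args and touch the tensor only once before and once after the loop nest. The before/after shape of that reduction, sketched in Python (illustrative, with invented names; not IREE code):

import numpy as np

def taps():
    rng = np.random.default_rng(0)
    return [(rng.random((1, 6, 3), np.float32), rng.random((6, 1), np.float32)) for _ in range(25)]

def reduce_through_memory(pairs):
    # Before the pass: every tap re-reads the tile and writes it back.
    tile = np.zeros((1, 1, 3), np.float32)
    for lhs, rhs in pairs:
        acc = tile.copy()                              # vector.transfer_read
        acc += np.einsum('abc,bd->adc', lhs, rhs)      # vector.contract
        tile = acc                                     # vector.transfer_write
    return tile

def reduce_in_registers(pairs):
    # After the pass: read once, carry the value through the loop, write once.
    acc = np.zeros((1, 1, 3), np.float32)
    for lhs, rhs in pairs:
        acc += np.einsum('abc,bd->adc', lhs, rhs)
    return acc

assert np.allclose(reduce_through_memory(taps()), reduce_in_registers(taps()))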
storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32> %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x1x3xf32>, vector<1x1x3xf32> %38 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %37) -> (vector<1x1x3xf32>) { %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) { %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 
0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_3 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %41 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32> %42 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32> %43 = vector.transpose %42, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32> %44 = vector.extract %43[0] : vector<6x1xf32> from vector<1x6x1xf32> %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind} %41, %44, %arg9 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32> scf.yield %45 : vector<1x1x3xf32> } scf.yield %40 : vector<1x1x3xf32> } %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x1x3xf32> %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32> scf.yield %inserted_slice : tensor<2x1x1x9xf32> } scf.yield %35 : tensor<2x1x1x9xf32> } flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor> } } return } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %cst = arith.constant dense<0.000000e+00> : vector<1x1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst_0 = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 
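Canonicalizer and CSE have little left to do at this point: the constants were already deduplicated, and each extui/shli/ori chain feeds a distinct dynamic dimension, so no folds or common subexpressions remain and the function survives both passes unchanged.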
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32>
          %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x1x3xf32>, vector<1x1x3xf32>
          %38 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %37) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_3 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32>
              %42 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32>
              %43 = vector.transpose %42, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32>
              %44 = vector.extract %43[0] : vector<6x1xf32> from vector<1x6x1xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %44, %arg9 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32>
              scf.yield %45 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32>
          %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x1x3xf32>, vector<1x1x3xf32>
          %38 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %37) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_3 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32>
              %42 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32>
              %43 = vector.transpose %42, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32>
              %44 = vector.extract %43[0] : vector<6x1xf32> from vector<1x6x1xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %44, %arg9 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32>
              scf.yield %45 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump Before LLVMCPUVerifyVectorSizeLegalityPass (iree-llvmcpu-verify-vector-size-legality) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32>
          %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x1x3xf32>, vector<1x1x3xf32>
          %38 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %37) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_3 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32>
              %42 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32>
              %43 = vector.transpose %42, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32>
              %44 = vector.extract %43[0] : vector<6x1xf32> from vector<1x6x1xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %44, %arg9 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32>
              scf.yield %45 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump After LLVMCPUVerifyVectorSizeLegalityPass (iree-llvmcpu-verify-vector-size-legality) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32>
          %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x1x3xf32>, vector<1x1x3xf32>
          %38 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %37) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_3 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32>
              %42 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32>
              %43 = vector.transpose %42, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32>
              %44 = vector.extract %43[0] : vector<6x1xf32> from vector<1x6x1xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %44, %arg9 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32>
              scf.yield %45 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump Before OptimizeVectorTransferPass (iree-codegen-optimize-vector-transfer) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x3xf32>, tensor<1x1x1x3xf32>
          %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x1x3xf32>, vector<1x1x3xf32>
          %38 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %37) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_3 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x3xf32>, vector<1x6x3xf32>
              %42 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x6x1xf32>, vector<1x6x1xf32>
              %43 = vector.transpose %42, [2, 1, 0] : vector<1x6x1xf32> to vector<1x6x1xf32>
              %44 = vector.extract %43[0] : vector<6x1xf32> from vector<1x6x1xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %44, %arg9 : vector<1x6x3xf32>, vector<6x1xf32> into vector<1x1x3xf32>
              scf.yield %45 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump After OptimizeVectorTransferPass (iree-codegen-optimize-vector-transfer) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_2 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst_0, %extracted_slice_2[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32>
          %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %cst) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_4 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x3xf32>, vector<6x3xf32>
              %42 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x1xf32>, vector<6x1xf32>
              %43 = vector.shape_cast %42 : vector<6x1xf32> to vector<6xf32>
              %44 = vector.extract %arg9[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %41, %43, %44 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %46 = vector.broadcast %45 : vector<3xf32> to vector<1x3xf32>
              %47 = vector.broadcast %46 : vector<1x3xf32> to vector<1x1x3xf32>
              scf.yield %47 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %38 = vector.extract %37[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump Before EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_2 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst_0, %extracted_slice_2[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32>
          %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %cst) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_4 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x3xf32>, vector<6x3xf32>
              %42 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x1xf32>, vector<6x1xf32>
              %43 = vector.shape_cast %42 : vector<6x1xf32> to vector<6xf32>
              %44 = vector.extract %arg9[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %41, %43, %44 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %46 = vector.broadcast %45 : vector<3xf32> to vector<1x3xf32>
              %47 = vector.broadcast %46 : vector<1x3xf32> to vector<1x1x3xf32>
              scf.yield %47 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %38 = vector.extract %37[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x6x5x5xf32>> -> tensor<1x6x5x5xf32>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>> -> tensor<2x1x1x9xf32>
      %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32>
      %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) {
        %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) {
          %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32>
          %extracted_slice_2 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32>
          %36 = vector.transfer_write %cst_0, %extracted_slice_2[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32>
          %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %cst) -> (vector<1x1x3xf32>) {
            %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32>
              %extracted_slice_4 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32>
              %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32>
              %extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32>
              %41 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x3xf32>, vector<6x3xf32>
              %42 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x1xf32>, vector<6x1xf32>
              %43 = vector.shape_cast %42 : vector<6x1xf32> to vector<6xf32>
              %44 = vector.extract %arg9[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %45 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %41, %43, %44 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %46 = vector.broadcast %45 : vector<3xf32> to vector<1x3xf32>
              %47 = vector.broadcast %46 : vector<1x3xf32> to vector<1x1x3xf32>
              scf.yield %47 : vector<1x1x3xf32>
            }
            scf.yield %40 : vector<1x1x3xf32>
          }
          %38 = vector.extract %37[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %39 = vector.transfer_write %38, %36[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32>
          %inserted_slice = tensor.insert_slice %39 into %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<2x1x1x9xf32>
          scf.yield %inserted_slice : tensor<2x1x1x9xf32>
        }
        scf.yield %35 : tensor<2x1x1x9xf32>
      }
      flow.dispatch.tensor.store %34, %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : tensor<2x1x1x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x4x7x9xf32>>
    }
  }
  return
}
// -----// IR Dump Before EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_2 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = vector.transfer_write %cst_0, %extracted_slice_2[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %cst) -> (vector<1x1x3xf32>) { %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) { %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_4 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %41 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x3xf32>, vector<6x3xf32> %42 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x1xf32>, vector<6x1xf32> %43 = vector.shape_cast %42 : vector<6x1xf32> to vector<6xf32> %44 = vector.extract %arg9[0, 0] : vector<3xf32> from vector<1x1x3xf32> %45 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, 
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
// (identical to the preceding dump: no tensor.empty remains in this dispatch for the pass to rewrite)
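// For context on why this pass is a no-op here: empty-tensor-to-alloc-tensor only
// rewrites tensor.empty ops, and the dispatch's only tensor.empty has already been
// folded away by earlier passes. Where one does survive to this point, the rewrite
// is a one-for-one swap -- a minimal sketch with a hypothetical shape, not taken
// from this dump:
//
//   // a destination tensor with undefined contents ...
//   %empty = tensor.empty() : tensor<2x1x1x9xf32>
//   // ... becomes an explicit allocation that one-shot bufferization
//   // can later lower to a memref.alloc:
//   %alloc = bufferization.alloc_tensor() : tensor<2x1x1x9xf32>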
// -----// IR Dump Before IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
// (identical to the preceding dump)
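// Note: the long arith prologue repeated at the top of every dump (%8 through %27)
// reconstitutes the four dynamic input dimensions. Each index-typed dimension
// crosses the dispatch boundary as two 32-bit push constants and is rebuilt as
// low | (high << 32). One such decode, with illustrative names in place of the
// dump's SSA numbers:
//
//   %lo    = arith.extui %w0 : i32 to i64       // zero-extend the low word
//   %hi    = arith.extui %w1 : i32 to i64       // zero-extend the high word
//   %hi64  = arith.shli %hi, %c32_i64 : i64     // move the high word into bits 32..63
//   %dim64 = arith.ori %lo, %hi64 : i64         // recombine the 64-bit extent
//   %dim   = arith.index_castui %dim64 : i64 to index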
storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor>{%12, %17, %22, %27} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_count_y = hal.interface.workgroup.count[1] : index scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y { %31 = flow.dispatch.tensor.load %28, offsets = [%arg0, 0, 0, 0], sizes = [1, 6, 5, 5], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x6x5x5xf32> scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x { %32 = flow.dispatch.tensor.load %29, offsets = [0, %arg0, %arg1, 0], sizes = [2, 1, 1, 9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1x1x9xf32> %33 = flow.dispatch.tensor.load %30, offsets = [0, 0, %arg1, 0], sizes = [2, 6, 5, 13], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%12, %17, %22, %27} -> tensor<2x6x5x13xf32> %34 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %32) -> (tensor<2x1x1x9xf32>) { %35 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (tensor<2x1x1x9xf32>) { %extracted_slice = tensor.extract_slice %33[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : tensor<2x6x5x13xf32> to tensor<1x6x5x7xf32> %extracted_slice_2 = tensor.extract_slice %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<2x1x1x9xf32> to tensor<1x1x1x3xf32> %36 = vector.transfer_write %cst_0, %extracted_slice_2[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, tensor<1x1x1x3xf32> %37 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %cst) -> (vector<1x1x3xf32>) { %40 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) { %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x5x7xf32> to tensor<1x6x1x3xf32> %extracted_slice_4 = tensor.extract_slice %31[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x5x5xf32> to tensor<1x6x1x1xf32> %extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : tensor<1x6x1x3xf32> to tensor<1x6x3xf32> %extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<1x6x1xf32> %41 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x3xf32>, vector<6x3xf32> %42 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<1x6x1xf32>, vector<6x1xf32> %43 = vector.shape_cast %42 : vector<6x1xf32> to vector<6xf32> %44 = vector.extract %arg9[0, 0] : vector<3xf32> from vector<1x1x3xf32> %45 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind} %41, %43, %44 : vector<6x3xf32>, vector<6xf32> into vector<3xf32> %46 = 
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %subview = memref.subview %28[%arg0, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %subview_2 = memref.subview %29[0, %arg0, %arg1, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_3 = memref.subview %30[0, 0, %arg1, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %31 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %subview_2) -> (memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
        %32 = scf.for %arg4 = %c0 to %c9 step %c3 iter_args(%arg5 = %arg3) -> (memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
          %subview_5 = memref.subview %subview_3[%arg2, 0, 0, %arg4] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_6 = memref.subview %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          vector.transfer_write %cst_0, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %33 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %cst) -> (vector<1x1x3xf32>) {
            %35 = scf.for %arg8 = %c0 to %c5 step %c1 iter_args(%arg9 = %arg7) -> (vector<1x1x3xf32>) {
              %subview_8 = memref.subview %subview_5[0, 0, %arg6, %arg8] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_9 = memref.subview %subview[0, 0, %arg6, %arg8] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_10 = memref.subview %subview_8[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_11 = memref.subview %subview_9[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %36 = vector.transfer_read %subview_10[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x3xf32>
              %37 = vector.transfer_read %subview_11[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x1xf32>
              %38 = vector.shape_cast %37 : vector<6x1xf32> to vector<6xf32>
              %39 = vector.extract %arg9[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %40 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %36, %38, %39 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %41 = vector.broadcast %40 : vector<3xf32> to vector<1x3xf32>
              %42 = vector.broadcast %41 : vector<1x3xf32> to vector<1x1x3xf32>
              scf.yield %42 : vector<1x1x3xf32>
            }
            scf.yield %35 : vector<1x1x3xf32>
          }
          %34 = vector.extract %33[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          vector.transfer_write %34, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_7 = memref.subview %arg5[%arg2, 0, 0, %arg4] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_6 : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_7 : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
          ^bb0(%in: f32, %out: f32):
            linalg.yield %in : f32
          }
          scf.yield %arg5 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        }
        scf.yield %32 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      }
      %subview_4 = memref.subview %29[0, %arg0, %arg1, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%31 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
      ^bb0(%in: f32, %out: f32):
        linalg.yield %in : f32
      }
    }
  }
  return
}
// -----// IR Dump Before ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
// (identical to the preceding dump)
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
// (identical to the preceding dump: no shaped-type result dims remain to resolve)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (identical to the preceding dump)
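// Two changes to look for in the Canonicalizer dump below, relative to the
// bufferized IR above: (1) the memref iter_args threaded through the outer
// scf.for nest are folded away -- each iteration yields its block argument
// unchanged, so the loop-carried buffer value is dead -- and (2) the two-step
// vector.broadcast (3xf32 -> 1x3xf32 -> 1x1x3xf32) fuses into a single
// broadcast. A reduced sketch of the loop rewrite, with a hypothetical body:
//
//   // before: the output buffer is a loop-carried value that never changes
//   %r = scf.for %i = %c0 to %c2 step %c1 iter_args(%m = %out) -> (memref<9xf32>) {
//     vector.transfer_write %v, %m[%i] {in_bounds = [true]} : vector<1xf32>, memref<9xf32>
//     scf.yield %m : memref<9xf32>
//   }
//
//   // after: writes target the buffer directly and the loop yields nothing
//   scf.for %i = %c0 to %c2 step %c1 {
//     vector.transfer_write %v, %out[%i] {in_bounds = [true]} : vector<1xf32>, memref<9xf32>
//   }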
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %subview = memref.subview %28[%arg0, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %subview_2 = memref.subview %29[0, %arg0, %arg1, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_3 = memref.subview %30[0, 0, %arg1, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      scf.for %arg2 = %c0 to %c2 step %c1 {
        scf.for %arg3 = %c0 to %c9 step %c3 {
          %subview_5 = memref.subview %subview_3[%arg2, 0, 0, %arg3] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_6 = memref.subview %subview_2[%arg2, 0, 0, %arg3] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          vector.transfer_write %cst_0, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %31 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %cst) -> (vector<1x1x3xf32>) {
            %33 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %arg5) -> (vector<1x1x3xf32>) {
              %subview_8 = memref.subview %subview_5[0, 0, %arg4, %arg6] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_9 = memref.subview %subview[0, 0, %arg4, %arg6] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_10 = memref.subview %subview_8[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_11 = memref.subview %subview_9[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %34 = vector.transfer_read %subview_10[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x3xf32>
              %35 = vector.transfer_read %subview_11[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x1xf32>
              %36 = vector.shape_cast %35 : vector<6x1xf32> to vector<6xf32>
              %37 = vector.extract %arg7[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %38 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %34, %36, %37 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %39 = vector.broadcast %38 : vector<3xf32> to vector<1x1x3xf32>
              scf.yield %39 : vector<1x1x3xf32>
            }
            scf.yield %33 : vector<1x1x3xf32>
          }
          %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          vector.transfer_write %32, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_7 = memref.subview %subview_2[%arg2, 0, 0, %arg3] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_6 : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_7 : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
          ^bb0(%in: f32, %out: f32):
            linalg.yield %in : f32
          }
        }
      }
      %subview_4 = memref.subview %29[0, %arg0, %arg1, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_2 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
      ^bb0(%in: f32, %out: f32):
        linalg.yield %in : f32
      }
    }
  }
  return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan
// -----// IR Dump After CSE (cse) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %subview = memref.subview %28[%arg0, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %subview_2 = memref.subview %29[0, %arg0, %arg1, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_3 = memref.subview %30[0, 0, %arg1, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      scf.for %arg2 = %c0 to %c2 step %c1 {
        scf.for %arg3 = %c0 to %c9 step %c3 {
          %subview_4 = memref.subview %subview_3[%arg2, 0, 0, %arg3] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_5 = memref.subview %subview_2[%arg2, 0, 0, %arg3] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          vector.transfer_write %cst_0, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %31 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %cst) -> (vector<1x1x3xf32>) {
            %33 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %arg5) -> (vector<1x1x3xf32>) {
              %subview_6 = memref.subview %subview_4[0, 0, %arg4, %arg6] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_7 = memref.subview %subview[0, 0, %arg4, %arg6] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %34 = vector.transfer_read %subview_8[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x3xf32>
              %35 = vector.transfer_read %subview_9[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x1xf32>
              %36 = vector.shape_cast %35 : vector<6x1xf32> to vector<6xf32>
              %37 = vector.extract %arg7[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %38 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %34, %36, %37 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %39 = vector.broadcast %38 : vector<3xf32> to vector<1x1x3xf32>
              scf.yield %39 : vector<1x1x3xf32>
            }
            scf.yield %33 : vector<1x1x3xf32>
          }
          %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          vector.transfer_write %32, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_5 : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_5 : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
          ^bb0(%in: f32, %out: f32):
            linalg.yield %in : f32
          }
        }
      }
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_2 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_2 : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
      ^bb0(%in: f32, %out: f32):
        linalg.yield %in : f32
      }
    }
  }
  return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %subview = memref.subview %28[%arg0, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %subview_2 = memref.subview %29[0, %arg0, %arg1, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_3 = memref.subview %30[0, 0, %arg1, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      scf.for %arg2 = %c0 to %c2 step %c1 {
        scf.for %arg3 = %c0 to %c9 step %c3 {
          %subview_4 = memref.subview %subview_3[%arg2, 0, 0, %arg3] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_5 = memref.subview %subview_2[%arg2, 0, 0, %arg3] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          vector.transfer_write %cst_0, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %31 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %cst) -> (vector<1x1x3xf32>) {
            %33 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %arg5) -> (vector<1x1x3xf32>) {
              %subview_6 = memref.subview %subview_4[0, 0, %arg4, %arg6] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_7 = memref.subview %subview[0, 0, %arg4, %arg6] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %34 = vector.transfer_read %subview_8[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x3xf32>
              %35 = vector.transfer_read %subview_9[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x1xf32>
              %36 = vector.shape_cast %35 : vector<6x1xf32> to vector<6xf32>
              %37 = vector.extract %arg7[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %38 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %34, %36, %37 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %39 = vector.broadcast %38 : vector<3xf32> to vector<1x1x3xf32>
              scf.yield %39 : vector<1x1x3xf32>
            }
            scf.yield %33 : vector<1x1x3xf32>
          }
          %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          vector.transfer_write %32, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        }
      }
    }
  }
  return
}
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_1 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  scf.for %arg0 = %workgroup_id_y to %c4 step %workgroup_count_y {
    %subview = memref.subview %28[%arg0, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.for %arg1 = %workgroup_id_x to %c7 step %workgroup_count_x {
      %subview_2 = memref.subview %29[0, %arg0, %arg1, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_3 = memref.subview %30[0, 0, %arg1, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      scf.for %arg2 = %c0 to %c2 step %c1 {
        scf.for %arg3 = %c0 to %c9 step %c3 {
          %subview_4 = memref.subview %subview_3[%arg2, 0, 0, %arg3] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_5 = memref.subview %subview_2[%arg2, 0, 0, %arg3] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          vector.transfer_write %cst_0, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %31 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %cst) -> (vector<1x1x3xf32>) {
            %33 = scf.for %arg6 = %c0 to %c5 step %c1 iter_args(%arg7 = %arg5) -> (vector<1x1x3xf32>) {
              %subview_6 = memref.subview %subview_4[0, 0, %arg4, %arg6] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_7 = memref.subview %subview[0, 0, %arg4, %arg6] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %34 = vector.transfer_read %subview_8[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x3xf32>
              %35 = vector.transfer_read %subview_9[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<6x1xf32>
              %36 = vector.shape_cast %35 : vector<6x1xf32> to vector<6xf32>
              %37 = vector.extract %arg7[0, 0] : vector<3xf32> from vector<1x1x3xf32>
              %38 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %34, %36, %37 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
              %39 = vector.broadcast %38 : vector<3xf32> to vector<1x1x3xf32>
              scf.yield %39 : vector<1x1x3xf32>
            }
            scf.yield %33 : vector<1x1x3xf32>
          }
          %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          vector.transfer_write %32, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        }
      }
    }
  }
  return
}
// -----// IR Dump After RemoveSingleIterationLoopPass (iree-codegen-remove-single-iteration-loop) //----- //
@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst_1 = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref>{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 
0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %subview_3 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_4 = memref.subview %subview_3[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_5 = memref.subview %subview_2[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> vector.transfer_write %cst_0, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_6 = memref.subview %subview_4[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_7 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> %34 = vector.transfer_read %subview_8[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type>, vector<6x3xf32> %35 = vector.transfer_read %subview_9[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type>, vector<6x1xf32> %36 = vector.shape_cast %35 : vector<6x1xf32> to vector<6xf32> %37 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %38 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind} %34, %36, %37 : vector<6x3xf32>, vector<6xf32> into vector<3xf32> %39 = vector.broadcast %38 : vector<3xf32> to vector<1x1x3xf32> scf.yield %39 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> vector.transfer_write %32, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<3xf32>, memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, 
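// NOTE: the rewrite above removed the two outer workgroup-distribution loops (the
// %arg0/%arg1 loops that indexed %subview_2/%subview_3 in the previous dump). With this
// tiling each workgroup provably executes exactly one iteration of them, so the pass
// replaces their induction variables with hal.interface.workgroup.id values and erases
// the loops; the inner tile loops survive and are renumbered. Minimal illustrative
// sketch of the pattern (hypothetical bounds, not taken from this dispatch):
//   %id = hal.interface.workgroup.id[0] : index
//   scf.for %i = %id to %ub step %wg_count_x {   // trip count is provably 1
//     "use"(%i) : (index) -> ()
//   }
// folds to:
//   "use"(%id) : (index) -> ()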
// -----// IR Dump Before LLVMCPUDropVectorUnitDimsPass (iree-llvmcpu-drop-vector-unit-dims) //----- //
// (elided: IR identical to the preceding dump)
// -----// IR Dump After LLVMCPUDropVectorUnitDimsPass (iree-llvmcpu-drop-vector-unit-dims) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  %subview_1 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  %subview_2 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c9 step %c3 {
      %subview_3 = memref.subview %subview_2[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_4 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) {
        %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) {
          %subview_6 = memref.subview %subview_3[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_7 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>
          %34 = vector.transfer_read %subview_10[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<6x3xf32>
          %subview_11 = memref.subview %subview_9[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %35 = vector.transfer_read %subview_11[%c0], %cst_0 {in_bounds = [true]} : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>, vector<6xf32>
          %36 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %37 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %34, %35, %36 : vector<6x3xf32>, vector<6xf32> into vector<3xf32>
          %38 = vector.broadcast %37 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %38 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>
      vector.transfer_write %32, %subview_5[%c0] {in_bounds = [true]} : vector<3xf32>, memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>
    }
  }
  return
}
iterator_types = ["parallel", "reduction"], kind = #vector.kind} %34, %35, %36 : vector<6x3xf32>, vector<6xf32> into vector<3xf32> %38 = vector.broadcast %37 : vector<3xf32> to vector<1x1x3xf32> scf.yield %38 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> vector.transfer_write %32, %subview_5[%c0] {in_bounds = [true]} : vector<3xf32>, memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> } } return } // -----// IR Dump Before LLVMCPUVirtualVectorLoweringPass (iree-llvmcpu-virtual-vector-lowering) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst_0 = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type> memref.assume_alignment %28, 64 : 
memref<4x6x5x5xf32, #hal.descriptor_type> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref>{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_1 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_3 = memref.subview %subview_2[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_6 = memref.subview %subview_3[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_7 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type> %34 = vector.transfer_read %subview_10[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type>, vector<6x3xf32> %subview_11 = memref.subview %subview_9[0, 0, 0] [1, 6, 1] 
[1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %35 = vector.transfer_read %subview_11[%c0], %cst_0 {in_bounds = [true]} : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type>, vector<6xf32> %36 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %37 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind} %34, %35, %36 : vector<6x3xf32>, vector<6xf32> into vector<3xf32> %38 = vector.broadcast %37 : vector<3xf32> to vector<1x1x3xf32> scf.yield %38 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> vector.transfer_write %32, %subview_5[%c0] {in_bounds = [true]} : vector<3xf32>, memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> } } return } // -----// IR Dump After LLVMCPUVirtualVectorLoweringPass (iree-llvmcpu-virtual-vector-lowering) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst_0 = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 
: i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref>{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_1 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_3 = memref.subview %subview_2[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_6 = memref.subview %subview_3[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_7 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to 
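// NOTE: LLVMCPUVirtualVectorLowering unrolled the single vector.contract (which
// contracted vector<6x3xf32> with vector<6xf32> into vector<3xf32>) along its
// reduction dimension of size 6 into six outer-product steps: each step extracts one
// row of the vector<6x3xf32> operand, broadcasts the matching scalar of the other
// operand (now fetched with a plain memref.load), and accumulates with vector.fma.
// One step of the chain, as it appears in the dump above:
//   %row  = vector.extract %34[0] : vector<3xf32> from vector<6x3xf32>
//   %s    = memref.load %subview_11[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
//   %b    = vector.broadcast %s : f32 to vector<3xf32>
//   %acc1 = vector.fma %row, %b, %acc0 : vector<3xf32>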
memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type> %34 = vector.transfer_read %subview_10[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type>, vector<6x3xf32> %subview_11 = memref.subview %subview_9[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %35 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %36 = vector.extract %34[0] : vector<3xf32> from vector<6x3xf32> %37 = memref.load %subview_11[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %38 = vector.broadcast %37 : f32 to vector<3xf32> %39 = vector.fma %36, %38, %35 : vector<3xf32> %40 = vector.extract %34[1] : vector<3xf32> from vector<6x3xf32> %41 = memref.load %subview_11[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %40, %42, %39 : vector<3xf32> %44 = vector.extract %34[2] : vector<3xf32> from vector<6x3xf32> %45 = memref.load %subview_11[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %46 = vector.broadcast %45 : f32 to vector<3xf32> %47 = vector.fma %44, %46, %43 : vector<3xf32> %48 = vector.extract %34[3] : vector<3xf32> from vector<6x3xf32> %49 = memref.load %subview_11[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %50 = vector.broadcast %49 : f32 to vector<3xf32> %51 = vector.fma %48, %50, %47 : vector<3xf32> %52 = vector.extract %34[4] : vector<3xf32> from vector<6x3xf32> %53 = memref.load %subview_11[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %52, %54, %51 : vector<3xf32> %56 = vector.extract %34[5] : vector<3xf32> from vector<6x3xf32> %57 = memref.load %subview_11[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %58 = vector.broadcast %57 : f32 to vector<3xf32> %59 = vector.fma %56, %58, %55 : vector<3xf32> %60 = vector.broadcast %59 : vector<3xf32> to vector<1x1x3xf32> scf.yield %60 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> vector.transfer_write %32, %subview_5[%c0] {in_bounds = [true]} : vector<3xf32>, memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> } } return } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 
32 : i64 %cst_0 = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref>{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_1 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 
1, 1, 1] : memref> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_3 = memref.subview %subview_2[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_6 = memref.subview %subview_3[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_7 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type> %34 = vector.transfer_read %subview_10[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type>, vector<6x3xf32> %subview_11 = memref.subview %subview_9[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %35 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %36 = vector.extract %34[0] : vector<3xf32> from vector<6x3xf32> %37 = memref.load %subview_11[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %38 = vector.broadcast %37 : f32 to vector<3xf32> %39 = vector.fma %36, %38, %35 : vector<3xf32> %40 = vector.extract %34[1] : vector<3xf32> from vector<6x3xf32> %41 = memref.load %subview_11[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %40, %42, %39 : vector<3xf32> %44 = vector.extract %34[2] : vector<3xf32> from vector<6x3xf32> %45 = memref.load %subview_11[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %46 = vector.broadcast %45 : f32 to vector<3xf32> %47 = vector.fma %44, %46, %43 : vector<3xf32> %48 = vector.extract %34[3] : vector<3xf32> from vector<6x3xf32> %49 = memref.load %subview_11[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %50 = vector.broadcast 
%49 : f32 to vector<3xf32> %51 = vector.fma %48, %50, %47 : vector<3xf32> %52 = vector.extract %34[4] : vector<3xf32> from vector<6x3xf32> %53 = memref.load %subview_11[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %52, %54, %51 : vector<3xf32> %56 = vector.extract %34[5] : vector<3xf32> from vector<6x3xf32> %57 = memref.load %subview_11[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %58 = vector.broadcast %57 : f32 to vector<3xf32> %59 = vector.fma %56, %58, %55 : vector<3xf32> %60 = vector.broadcast %59 : vector<3xf32> to vector<1x1x3xf32> scf.yield %60 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> vector.transfer_write %32, %subview_5[%c0] {in_bounds = [true]} : vector<3xf32>, memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> } } return } // -----// IR Dump After Canonicalizer (canonicalize) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst_0 = arith.constant 0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = 
arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref>{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_1 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_3 = memref.subview %subview_2[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_6 = memref.subview %subview_3[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_7 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> %subview_10 = 
memref.subview %subview_8[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type> %34 = vector.transfer_read %subview_10[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type>, vector<6x3xf32> %subview_11 = memref.subview %subview_9[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %35 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %36 = vector.extract %34[0] : vector<3xf32> from vector<6x3xf32> %37 = memref.load %subview_11[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %38 = vector.broadcast %37 : f32 to vector<3xf32> %39 = vector.fma %36, %38, %35 : vector<3xf32> %40 = vector.extract %34[1] : vector<3xf32> from vector<6x3xf32> %41 = memref.load %subview_11[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %40, %42, %39 : vector<3xf32> %44 = vector.extract %34[2] : vector<3xf32> from vector<6x3xf32> %45 = memref.load %subview_11[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %46 = vector.broadcast %45 : f32 to vector<3xf32> %47 = vector.fma %44, %46, %43 : vector<3xf32> %48 = vector.extract %34[3] : vector<3xf32> from vector<6x3xf32> %49 = memref.load %subview_11[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %50 = vector.broadcast %49 : f32 to vector<3xf32> %51 = vector.fma %48, %50, %47 : vector<3xf32> %52 = vector.extract %34[4] : vector<3xf32> from vector<6x3xf32> %53 = memref.load %subview_11[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %52, %54, %51 : vector<3xf32> %56 = vector.extract %34[5] : vector<3xf32> from vector<6x3xf32> %57 = memref.load %subview_11[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %58 = vector.broadcast %57 : f32 to vector<3xf32> %59 = vector.fma %56, %58, %55 : vector<3xf32> %60 = vector.broadcast %59 : vector<3xf32> to vector<1x1x3xf32> scf.yield %60 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> vector.transfer_write %32, %subview_5[%c0] {in_bounds = [true]} : vector<3xf32>, memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type> } } return } // -----// IR Dump Before LLVMCPUVectorTransferLoweringPass (iree-llvmcpu-vector-transfer-lowering) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst_0 = arith.constant 
0.000000e+00 : f32 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref>{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_1 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref> to 
memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_3 = memref.subview %subview_2[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_6 = memref.subview %subview_3[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_7 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> %subview_9 = memref.subview %subview_7[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type> %34 = vector.transfer_read %subview_10[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type>, vector<6x3xf32> %subview_11 = memref.subview %subview_9[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %35 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %36 = vector.extract %34[0] : vector<3xf32> from vector<6x3xf32> %37 = memref.load %subview_11[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %38 = vector.broadcast %37 : f32 to vector<3xf32> %39 = vector.fma %36, %38, %35 : vector<3xf32> %40 = vector.extract %34[1] : vector<3xf32> from vector<6x3xf32> %41 = memref.load %subview_11[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %40, %42, %39 : vector<3xf32> %44 = vector.extract %34[2] : vector<3xf32> from vector<6x3xf32> %45 = memref.load %subview_11[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %46 = vector.broadcast %45 : f32 to vector<3xf32> %47 = vector.fma %44, %46, %43 : vector<3xf32> %48 = vector.extract %34[3] : vector<3xf32> from vector<6x3xf32> %49 = memref.load %subview_11[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type> %50 = vector.broadcast %49 : f32 to 
// -----// IR Dump After LLVMCPUVectorTransferLoweringPass (iree-llvmcpu-vector-transfer-lowering) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %c4 = arith.constant 4 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
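  // Dynamic sizes cross the HAL ABI as pairs of 32-bit push constants: %8..%27 above
  // reassemble each i64 extent as lo | (hi << 32) and cast it to index. A C sketch of
  // the same packing (illustrative only; the names are not from the dump):
  //   uint64_t extent = (uint64_t)lo | ((uint64_t)hi << 32);  /* %11 from %0, %1 */
  //   size_t   dim0   = (size_t)extent;                       /* %12             */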
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c9 step %c3 {
      %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) {
        %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) {
          %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
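          // The rank-reducing subviews here collapse unit dimensions (1x6x1x3 -> 1x6x3 -> 6x3
          // for the input window, 1x6x1x1 -> 1x6x1 -> 6xf32 for the filter column) so the
          // loads below can use plain 2-D and 1-D strided indexing; note the filter map
          // (d0)[s0] -> (d0 * 25 + s0), i.e. stride 25 between the 5x5 filter planes.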
          %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>
          %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
    }
  }
  return
}
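// After decomposition the innermost computation is a 1-D convolution microkernel:
// for each filter tap (kh, kw) it loads a 6x3 channel-by-column window row by row
// and accumulates three output columns against a broadcast filter scalar with
// fully unrolled vector.fma. Equivalent scalar sketch (illustrative only):
//   for (int c = 0; c < 6; ++c)        /* row loads %34..%39          */
//     for (int x = 0; x < 3; ++x)      /* the three vector lanes      */
//       acc[x] += in[c][x] * w[c];     /* vector.fma with broadcast w */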
// -----// IR Dump Before LLVMCPUVectorTransposeLoweringPass (iree-llvmcpu-vector-transpose-lowering) //----- //
// (function body identical to the dump above; duplicate elided)
// -----// IR Dump After LLVMCPUVectorTransposeLoweringPass (iree-llvmcpu-vector-transpose-lowering) //----- //
// (no vector.transpose ops to rewrite; IR unchanged, duplicate elided)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (identical to the dump after iree-llvmcpu-vector-transfer-lowering; duplicate elided)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (canonicalization found nothing to fold; IR unchanged, duplicate elided)
// -----// IR Dump Before LLVMCPUVectorShapeCastLoweringPass (iree-llvmcpu-vector-shape-cast-lowering) //----- //
// (identical; duplicate elided, the post-pass dump is reproduced in full below)
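// LLVMCPUVectorShapeCastLoweringPass only rewrites vector.shape_cast ops; this
// kernel carries none (only vector.broadcast/extract), so no change is expected.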
: memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
    }
  }
  return
}
// -----// IR Dump After LLVMCPUVectorShapeCastLoweringPass (iree-llvmcpu-vector-shape-cast-lowering) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %c4 = arith.constant 4 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32, #hal.descriptor_type<storage_buffer>> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c9 step %c3 {
      %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) {
        %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) {
          %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>>
          %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>
          %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>, #hal.descriptor_type<storage_buffer>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>, #hal.descriptor_type<storage_buffer>>, vector<3xf32>
    }
  }
  return
}
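The innermost reduction above has been fully unrolled over the six input channels: six vector.load ops pull one 3-wide row strip per channel (%34 through %39), and six vector.fma ops (%43, %46, %49, %52, %55, %58) accumulate them against one broadcast filter scalar each. A minimal C sketch of that micro-kernel follows; the helper name and array shapes are hypothetical, not anything emitted by IREE:

/* Minimal C sketch of the unrolled inner kernel above (hypothetical names).
 * For one fixed (kh, kw) filter tap it accumulates six input channels into
 * a 3-wide strip of outputs, one FMA per channel, mirroring the chain of
 * vector.fma ops %43 ... %58. */
static void conv_tap_6ch_3w(const float in[6][3],  /* rows %34 .. %39   */
                            const float w[6],      /* scalars %41, %44, ... */
                            float acc[3]) {        /* %40 in, %58 out   */
  for (int c = 0; c < 6; ++c)      /* fully unrolled in the IR          */
    for (int x = 0; x < 3; ++x)    /* the three vector<3xf32> lanes     */
      acc[x] += in[c][x] * w[c];   /* vector.fma(in, broadcast(w), acc) */
}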
// -----// IR Dump After LLVMCPULowerExecutableTargetPass (iree-llvmcpu-lower-executable-target) //----- //
// (function body identical to the preceding dump; the pass made no further changes)
// -----// IR Dump Before EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
// (IR unchanged from the dump above)
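The only pass in this stretch that actually edits the IR is EraseHALDescriptorTypeFromMemRefPass, shown next: it erases the #hal.descriptor_type<storage_buffer> memory-space annotation from every memref type, so that, for example, memref<4x6x5x5xf32, #hal.descriptor_type<storage_buffer>> becomes plain memref<4x6x5x5xf32> for the LLVM-oriented passes that follow. The function is otherwise untouched.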
// -----// IR Dump After EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %c4 = arith.constant 4 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>>
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c9 step %c3 {
      %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>>
      %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>>
      %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) {
        %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) {
          %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>>
          %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>>
          %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>>
          %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>>
          %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
          %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
    }
  }
  return
}
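Both versions of the function open with the same preamble: the four dynamic extents of the input tensor cross the HAL ABI as eight 32-bit push constants (ordinals 0 to 7), and each pair is reassembled into one 64-bit value and then an index (%8 through %27) before being used as a memref size. A small C sketch of that reassembly, with a hypothetical helper name:

#include <stdint.h>
#include <stddef.h>

/* Sketch of the push-constant reassembly above (hypothetical helper name):
 * each dynamic extent arrives as two i32 halves and is rebuilt as
 * (zext lo) | (zext hi << 32), matching the arith.extui / arith.shli /
 * arith.ori sequences that feed %12, %17, %22 and %27. */
static size_t rebuild_extent(uint32_t lo, uint32_t hi) {
  uint64_t wide = (uint64_t)lo | ((uint64_t)hi << 32);
  return (size_t)wide;  /* arith.index_castui : i64 to index */
}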
// -----// IR Dump Before LowerUKernelOpsToCallsPass (iree-codegen-lower-ukernel-ops-to-calls) //----- //
// (same function body as the dump above, now printed inside its enclosing module { ... }; the IR is otherwise unchanged)
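Every subview in these dumps carries a strided layout. An annotation such as affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> on a memref<6x3xf32, ...> simply says that element (d0, d1) lives at linear offset d0 * s1 + s0 + d1 from the binding's base pointer, with the offset s0 and the row stride s1 supplied at runtime. A small C sketch of that addressing, with a hypothetical helper name:

#include <stddef.h>

/* Sketch of the strided addressing used by %subview_9 above (hypothetical
 * helper). Element (d0, d1) of a memref<6x3xf32> with layout
 * (d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1) sits at base[d0*s1 + s0 + d1]. */
static float load_strided_6x3(const float *base, size_t s0, size_t s1,
                              size_t d0, size_t d1) {
  return base[d0 * s1 + s0 + d1];
}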
// -----// IR Dump After LowerUKernelOpsToCallsPass (iree-codegen-lower-ukernel-ops-to-calls) //----- //
// (IR unchanged; this dispatch contains no ukernel ops to lower)
// -----// IR Dump Before LinalgExtToLoopsPass (iree-linalg-ext-to-loops) //----- //
// (IR unchanged from the dump above)
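For reference, the control structure shared by all of these dumps: the workgroup ids pick one filter (%workgroup_id_y) and one output row (%workgroup_id_x), the two outer scf.for ops walk the batch of 2 and the 9 output columns in 3-wide strips, and the two inner scf.for ops reduce over the 5x5 filter window while carrying the vector<1x1x3xf32> accumulator through iter_args. A hypothetical C skeleton of that nest:

/* Hypothetical C skeleton of the loop nest in every dump above. */
static void workgroup_slice(void) {
  for (int n = 0; n < 2; ++n)              /* scf.for %arg0: batch          */
    for (int x0 = 0; x0 < 9; x0 += 3)      /* scf.for %arg1: 3-wide strips  */
      for (int kh = 0; kh < 5; ++kh)       /* scf.for %arg2: filter rows    */
        for (int kw = 0; kw < 5; ++kw) {   /* scf.for %arg4: filter cols    */
          /* six-channel, 3-wide FMA body (see the first sketch above) */
        }
}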
i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, 
vector<3xf32>
          %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
    }
  }
  return
}
// -----// IR Dump After LinalgExtToLoopsPass (iree-linalg-ext-to-loops) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  %c4 = arith.constant 4 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>>
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c9 step %c3 {
      %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>>
      %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>>
      %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) {
        %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) {
          %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>>
          %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>>
          %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>>
          %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>>
          %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
          %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
    }
  }
  return
}
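Two details of the dump above are worth decoding before the remaining passes, which leave this function untouched. First, the preamble (%0 through %27): the four dynamic dimensions of the binding(0) input memref arrive as eight 32-bit push constants, one low/high word pair per dimension, and are reassembled with arith.extui, arith.shli, and arith.ori. A minimal Python sketch of that reassembly (illustrative only; decode_dims is a hypothetical name, not an IREE API):

# Sketch of the %8..%27 preamble: each 64-bit dynamic dimension is shipped
# as two i32 push constants (low word at the even ordinal, high word at the
# odd one) and rebuilt with zero-extend, shift-left-32, and bitwise-or.
def decode_dims(push_constants):
    dims = []
    for lo, hi in zip(push_constants[0::2], push_constants[1::2]):
        dims.append(((hi & 0xFFFFFFFF) << 32) | (lo & 0xFFFFFFFF))
    return dims

# The conv shapes (output 2x4x7x9, filter 4x6x5x5) imply a 2x6x11x13 input:
assert decode_dims([2, 0, 6, 0, 11, 0, 13, 0]) == [2, 6, 11, 13]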
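Second, the loop nest itself: it computes a plain NCHW/FCHW 2-D convolution, decomposed into 1-D slices and vectorized by 3 along the output width. The following Python rendering of the same schedule may make the subview chains easier to follow (a sketch, not IREE output; the workgroup grid is written as the two outer loops, and nested lists stand in for the memrefs):

def conv2d_nchw_fchw(x, w, N=2, C=6, H=11, W=13, F=4, KH=5, KW=5):
    OH, OW = H - KH + 1, W - KW + 1           # 7 and 9
    y = [[[[0.0] * OW for _ in range(OH)] for _ in range(F)] for _ in range(N)]
    for f in range(F):                        # hal.interface.workgroup.id[1]
        for oh in range(OH):                  # hal.interface.workgroup.id[0]
            for n in range(N):                # scf.for %arg0
                for ow in range(0, OW, 3):    # scf.for %arg1, tile of 3
                    acc = [0.0, 0.0, 0.0]     # %cst, carried via iter_args
                    for kh in range(KH):      # scf.for %arg2
                        for kw in range(KW):  # scf.for %arg4
                            for c in range(C):        # unrolled loads %34..%39
                                wv = w[f][c][kh][kw]  # memref.load + broadcast
                                for j in range(3):    # one vector.fma
                                    acc[j] += x[n][c][oh + kh][ow + kw + j] * wv
                    for j in range(3):        # vector.store to the output tile
                        y[n][f][oh][ow + j] = acc[j]
    return y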
(iree-codegen-memrefcopy-to-linalg) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: 
?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load 
%subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = 
hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + 
d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump Before ConvertLinalgToLoopsPass (convert-linalg-to-loops) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, 
storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to 
memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump After ConvertLinalgToLoopsPass (convert-linalg-to-loops) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : 
vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 
13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + 
s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump Before ConvertBf16ArithToF32Pass (iree-convert-bf16-arith-to-f32) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : 
memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, 
vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump After ConvertBf16ArithToF32Pass (iree-convert-bf16-arith-to-f32) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load 
layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = 
memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump Before ConvertBf16ToUInt16BuffersPass (iree-codegen-convert-bf16-to-uint16-buffers) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 
32 : i64
  %0 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32>
  %29 = hal.interface.binding.subspan layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32>
  %30 = hal.interface.binding.subspan layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>>
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c9 step %c3 {
      %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>>
      %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>>
      %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) {
        %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) {
          %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>>
          %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>>
          %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>>
          %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>>
          %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
          %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
    }
  }
  return
}
// -----// IR Dump After ConvertBf16ToUInt16BuffersPass (iree-codegen-convert-bf16-to-uint16-buffers) //----- //
// (this dispatch uses only f32 buffers, so the pass made no changes; the dump is identical to the one above)
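// Annotation (a reading aid inferred from the IR above, not compiler output):
// %workgroup_id_y selects one of the 4 output channels and %workgroup_id_x one
// of the 7 output rows. The two outer scf.for loops step over the batch (0..2)
// and over the 9 output columns in strips of 3; the two inner scf.for loops
// walk the 5x5 filter window. At each window position, six vector<3xf32> loads
// (one per input channel) are multiplied by six broadcast filter scalars with
// vector.fma, accumulating a 3-wide strip of output pixels that vector.store
// writes back. The static sizes are consistent with a stride-1 5x5 filter:
// H_out = 11 - 5 + 1 = 7 and W_out = 13 - 5 + 1 = 9.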
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (identical to the preceding dump)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (the canonicalizer found nothing to fold; identical to the preceding dump)
// -----// IR Dump Before CSE (cse) //----- //
// (identical to the preceding dump)
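// Annotation: the eight i32 push constants loaded at the top of the function
// are the 32-bit halves of four i64 values. Each pair is reassembled as
// lo | (hi << 32) via arith.extui/arith.shli/arith.ori and cast to index,
// producing the dynamic N, C, H, W sizes %12, %17, %22, %27 of the
// ?x?x?x?xf32 input bound at binding(0).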
// -----// IR Dump After CSE (cse) //----- //
// (CSE found no redundant subexpressions; identical to the preceding dump)
// -----// IR Dump Before OneShotBufferize (one-shot-bufferize) //----- //
// (identical function body to the preceding dump, here printed wrapped in its enclosing module)
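// Annotation: one-shot bufferization has nothing left to do at this point;
// earlier phases already placed the dispatch on memref subspans, so no tensor
// values remain to be given buffers.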
// -----// IR Dump After OneShotBufferize (one-shot-bufferize) //----- //
// (bufferization made no changes; identical to the preceding dump)
1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = 
memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } } // -----// IR Dump Before FoldTensorExtractOpPass (iree-codegen-fold-tensor-extract-op) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to 
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>>
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>>
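  // Per-workgroup tile: %workgroup_id_y selects the output channel, %workgroup_id_x the output row. The loops below cover the batch (2) and the output columns (9, in vector<3> steps); the 5x5 filter window is decomposed into per-tap, per-input-channel broadcast FMAs.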
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c9 step %c3 {
      %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>>
      %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>>
      %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) {
        %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) {
          %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>>
          %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>>
          %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>>
          %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>>
          %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
          %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
          %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32>
          %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %42 = vector.broadcast %41 : f32 to vector<3xf32>
          %43 = vector.fma %34, %42, %40 : vector<3xf32>
          %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %45 = vector.broadcast %44 : f32 to vector<3xf32>
          %46 = vector.fma %35, %45, %43 : vector<3xf32>
          %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %48 = vector.broadcast %47 : f32 to vector<3xf32>
          %49 = vector.fma %36, %48, %46 : vector<3xf32>
          %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %51 = vector.broadcast %50 : f32 to vector<3xf32>
          %52 = vector.fma %37, %51, %49 : vector<3xf32>
          %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %54 = vector.broadcast %53 : f32 to vector<3xf32>
          %55 = vector.fma %38, %54, %52 : vector<3xf32>
          %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
          %57 = vector.broadcast %56 : f32 to vector<3xf32>
          %58 = vector.fma %39, %57, %55 : vector<3xf32>
          %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32>
          scf.yield %59 : vector<1x1x3xf32>
        }
        scf.yield %33 : vector<1x1x3xf32>
      }
      %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
    }
  }
  return
}
// -----// IR Dump After FoldTensorExtractOpPass (iree-codegen-fold-tensor-extract-op) //----- //
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> scf.for %arg0 = %c0 to %c2 step %c1 { scf.for %arg1 = %c0 to %c9 step %c3 { %subview_2 = memref.subview %subview_1[%arg0, 0, 0, %arg1] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%arg0, 0, 0, %arg1] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> %31 = scf.for %arg2 = %c0 to %c5 step %c1 iter_args(%arg3 = %cst) -> (vector<1x1x3xf32>) { %33 = scf.for %arg4 = %c0 to %c5 step %c1 iter_args(%arg5 = %arg3) -> (vector<1x1x3xf32>) { %subview_5 = memref.subview %subview_2[0, 0, %arg2, %arg4] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_6 = memref.subview %subview[0, 0, %arg2, %arg4] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 
1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump Before ConvertComplexToStandard (convert-complex-to-standard) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant 
// -----// IR Dump After ConvertComplexToStandard (convert-complex-to-standard) //----- //
// -----// IR Dump Before PolynomialApproximationPass (iree-codegen-polynomial-approximation) //----- //
// -----// IR Dump After PolynomialApproximationPass (iree-codegen-polynomial-approximation) //----- //
// -----// IR Dump Before HoistStaticallyBoundAllocationsPass (iree-codegen-hoist-statically-bound-allocations) //----- //
// -----// IR Dump After HoistStaticallyBoundAllocationsPass (iree-codegen-hoist-statically-bound-allocations) //----- //
memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %34 = vector.load %subview_9[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %35 = vector.load %subview_9[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %36 = vector.load %subview_9[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %37 = vector.load %subview_9[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %38 = vector.load %subview_9[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %39 = vector.load %subview_9[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_10 = memref.subview %subview_8[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %40 = vector.extract %arg5[0, 0] : vector<3xf32> from vector<1x1x3xf32> %41 = memref.load %subview_10[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %42 = vector.broadcast %41 : f32 to vector<3xf32> %43 = vector.fma %34, %42, %40 : vector<3xf32> %44 = memref.load %subview_10[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %45 = vector.broadcast %44 : f32 to vector<3xf32> %46 = vector.fma %35, %45, %43 : vector<3xf32> %47 = memref.load %subview_10[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %48 = vector.broadcast %47 : f32 to vector<3xf32> %49 = vector.fma %36, %48, %46 : vector<3xf32> %50 = memref.load %subview_10[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %51 = vector.broadcast %50 : f32 to vector<3xf32> %52 = vector.fma %37, %51, %49 : vector<3xf32> %53 = memref.load %subview_10[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %54 = vector.broadcast %53 : f32 to vector<3xf32> %55 = vector.fma %38, %54, %52 : vector<3xf32> %56 = memref.load %subview_10[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %57 = vector.broadcast %56 : f32 to vector<3xf32> %58 = vector.fma %39, %57, %55 : vector<3xf32> %59 = vector.broadcast %58 : vector<3xf32> to vector<1x1x3xf32> scf.yield %59 : vector<1x1x3xf32> } scf.yield %33 : vector<1x1x3xf32> } %32 = vector.extract %31[0, 0] : vector<3xf32> from vector<1x1x3xf32> %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>> vector.store %32, %subview_4[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32> } } return } // -----// IR Dump Before IREEExpandStridedMetadataPass (iree-codegen-expand-strided-metadata) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = 
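// Note: ordinals 0-7 above carry the four dynamic input sizes as lo/hi i32 halves;
// the extui/shli/ori/index_castui chain (%8-%27) reassembles each pair into a
// 64-bit value and casts it to index, yielding the sizes {%12, %17, %22, %27}
// that bound the memref<?x?x?x?xf32> input at binding 0.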
// -----// IR Dump Before IREEExpandStridedMetadataPass (iree-codegen-expand-strided-metadata) //----- //
// (function body identical to the preceding dump)
// -----// IR Dump After IREEExpandStridedMetadataPass (iree-codegen-expand-strided-metadata) //----- //
// (function body identical to the preceding dump)
// -----// IR Dump Before CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
// (function body identical to the preceding dump)
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
// (function body identical to the preceding dump)
// -----// IR Dump Before LLVMCPUCheckIRBeforeLLVMConversionPass (iree-llvmcpu-check-ir-before-llvm-conversion) //----- //
// (function body identical to the preceding dump)
// -----// IR Dump After LLVMCPUCheckIRBeforeLLVMConversionPass (iree-llvmcpu-check-ir-before-llvm-conversion) //----- //
// (function body identical to the preceding dump)
// -----// IR Dump Before SCFToControlFlow (convert-scf-to-cf) //----- //
// (function body identical to the preceding dump)
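// Note: convert-scf-to-cf lowers the scf.for nest into explicit blocks below: each
// loop becomes a header block whose induction variable (and, for the two inner
// loops, the vector<1x1x3xf32> accumulator) is carried as a block argument, with
// cf.cond_br testing the bound and cf.br branching back to the header.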
+ s0)>>, vector<3xf32> } } return } // -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : 
memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> cf.br ^bb1(%c0 : index) ^bb1(%31: index): // 2 preds: ^bb0, ^bb11 %32 = arith.cmpi slt, %31, %c2 : index cf.cond_br %32, ^bb2, ^bb12 ^bb2: // pred: ^bb1 cf.br ^bb3(%c0 : index) ^bb3(%33: index): // 2 preds: ^bb2, ^bb10 %34 = arith.cmpi slt, %33, %c9 : index cf.cond_br %34, ^bb4, ^bb11 ^bb4: // pred: ^bb3 %subview_2 = memref.subview %subview_1[%31, 0, 0, %33] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%31, 0, 0, %33] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> cf.br ^bb5(%c0, %cst : index, vector<1x1x3xf32>) ^bb5(%35: index, %36: vector<1x1x3xf32>): // 2 preds: ^bb4, ^bb9 %37 = arith.cmpi slt, %35, %c5 : index cf.cond_br %37, ^bb6, ^bb10 ^bb6: // pred: ^bb5 cf.br ^bb7(%c0, %36 : index, vector<1x1x3xf32>) ^bb7(%38: index, %39: vector<1x1x3xf32>): // 2 preds: ^bb6, ^bb8 %40 = arith.cmpi slt, %38, %c5 : index cf.cond_br %40, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %subview_4 = memref.subview %subview_2[0, 0, %35, %38] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> %subview_5 = memref.subview %subview[0, 0, %35, %38] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> %subview_6 = memref.subview %subview_4[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> %subview_8 = memref.subview %subview_6[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>> %41 = vector.load %subview_8[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %42 = vector.load %subview_8[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %43 = vector.load %subview_8[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %44 = vector.load %subview_8[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %45 = vector.load %subview_8[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %46 = vector.load %subview_8[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32> %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>> %47 = vector.extract %39[0, 0] : vector<3xf32> from vector<1x1x3xf32> %48 = memref.load %subview_9[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + 
s0)>>
  %49 = vector.broadcast %48 : f32 to vector<3xf32>
  %50 = vector.fma %41, %49, %47 : vector<3xf32>
  %51 = memref.load %subview_9[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %52 = vector.broadcast %51 : f32 to vector<3xf32>
  %53 = vector.fma %42, %52, %50 : vector<3xf32>
  %54 = memref.load %subview_9[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %55 = vector.broadcast %54 : f32 to vector<3xf32>
  %56 = vector.fma %43, %55, %53 : vector<3xf32>
  %57 = memref.load %subview_9[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %58 = vector.broadcast %57 : f32 to vector<3xf32>
  %59 = vector.fma %44, %58, %56 : vector<3xf32>
  %60 = memref.load %subview_9[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %61 = vector.broadcast %60 : f32 to vector<3xf32>
  %62 = vector.fma %45, %61, %59 : vector<3xf32>
  %63 = memref.load %subview_9[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %64 = vector.broadcast %63 : f32 to vector<3xf32>
  %65 = vector.fma %46, %64, %62 : vector<3xf32>
  %66 = vector.broadcast %65 : vector<3xf32> to vector<1x1x3xf32>
  %67 = arith.addi %38, %c1 : index
  cf.br ^bb7(%67, %66 : index, vector<1x1x3xf32>)
^bb9:  // pred: ^bb7
  %68 = arith.addi %35, %c1 : index
  cf.br ^bb5(%68, %39 : index, vector<1x1x3xf32>)
^bb10:  // pred: ^bb5
  %69 = vector.extract %36[0, 0] : vector<3xf32> from vector<1x1x3xf32>
  %subview_10 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
  vector.store %69, %subview_10[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
  %70 = arith.addi %33, %c3 : index
  cf.br ^bb3(%70 : index)
^bb11:  // pred: ^bb3
  %71 = arith.addi %31, %c1 : index
  cf.br ^bb1(%71 : index)
^bb12:  // pred: ^bb1
  return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (IR identical to the preceding dump)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %c4 = arith.constant 4 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>>
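  // Note: relative to the SCFToControlFlow output above, canonicalize has
  // folded the empty forwarding blocks left behind by the scf-to-cf lowering
  // (the old ^bb2 and ^bb6), passing the loop init values directly as
  // successor operands of cf.cond_br, so this function has blocks ^bb1..^bb10
  // where the previous dump had ^bb1..^bb12. A minimal sketch of that block
  // merge (hypothetical %cond; not taken from this dump, and block numbers
  // then shift):
  //
  //   cf.cond_br %cond, ^bb2, ^bb12
  // ^bb2:  // pred: ^bb1
  //   cf.br ^bb3(%c0 : index)
  //
  // becomes
  //
  //   cf.cond_br %cond, ^bb3(%c0 : index), ^bb12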
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>>
  cf.br ^bb1(%c0 : index)
^bb1(%31: index):  // 2 preds: ^bb0, ^bb9
  %32 = arith.cmpi slt, %31, %c2 : index
  cf.cond_br %32, ^bb2(%c0 : index), ^bb10
^bb2(%33: index):  // 2 preds: ^bb1, ^bb8
  %34 = arith.cmpi slt, %33, %c9 : index
  cf.cond_br %34, ^bb3, ^bb9
^bb3:  // pred: ^bb2
  %subview_2 = memref.subview %subview_1[%31, 0, 0, %33] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>>
  %subview_3 = memref.subview %subview_0[%31, 0, 0, %33] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>>
  cf.br ^bb4(%c0, %cst : index, vector<1x1x3xf32>)
^bb4(%35: index, %36: vector<1x1x3xf32>):  // 2 preds: ^bb3, ^bb7
  %37 = arith.cmpi slt, %35, %c5 : index
  cf.cond_br %37, ^bb5(%c0, %36 : index, vector<1x1x3xf32>), ^bb8
^bb5(%38: index, %39: vector<1x1x3xf32>):  // 2 preds: ^bb4, ^bb6
  %40 = arith.cmpi slt, %38, %c5 : index
  cf.cond_br %40, ^bb6, ^bb7
^bb6:  // pred: ^bb5
  %subview_4 = memref.subview %subview_2[0, 0, %35, %38] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>>
  %subview_5 = memref.subview %subview[0, 0, %35, %38] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>>
  %subview_6 = memref.subview %subview_4[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>>
  %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>>
  %subview_8 = memref.subview %subview_6[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
  %41 = vector.load %subview_8[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %42 = vector.load %subview_8[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %43 = vector.load %subview_8[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %44 = vector.load %subview_8[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %45 = vector.load %subview_8[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %46 = vector.load %subview_8[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %47 = vector.extract %39[0, 0] : vector<3xf32> from vector<1x1x3xf32>
  %48 = memref.load %subview_9[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %49 = vector.broadcast %48 : f32 to vector<3xf32>
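  // The six vector.fma ops below accumulate over the six input channels: each
  // step loads one filter scalar from %subview_9, broadcasts it, and
  // multiplies it against a vector of three contiguous input pixels (%41..%46),
  // i.e. the three output columns this iteration produces.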
  %50 = vector.fma %41, %49, %47 : vector<3xf32>
  %51 = memref.load %subview_9[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %52 = vector.broadcast %51 : f32 to vector<3xf32>
  %53 = vector.fma %42, %52, %50 : vector<3xf32>
  %54 = memref.load %subview_9[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %55 = vector.broadcast %54 : f32 to vector<3xf32>
  %56 = vector.fma %43, %55, %53 : vector<3xf32>
  %57 = memref.load %subview_9[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %58 = vector.broadcast %57 : f32 to vector<3xf32>
  %59 = vector.fma %44, %58, %56 : vector<3xf32>
  %60 = memref.load %subview_9[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %61 = vector.broadcast %60 : f32 to vector<3xf32>
  %62 = vector.fma %45, %61, %59 : vector<3xf32>
  %63 = memref.load %subview_9[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %64 = vector.broadcast %63 : f32 to vector<3xf32>
  %65 = vector.fma %46, %64, %62 : vector<3xf32>
  %66 = vector.broadcast %65 : vector<3xf32> to vector<1x1x3xf32>
  %67 = arith.addi %38, %c1 : index
  cf.br ^bb5(%67, %66 : index, vector<1x1x3xf32>)
^bb7:  // pred: ^bb5
  %68 = arith.addi %35, %c1 : index
  cf.br ^bb4(%68, %39 : index, vector<1x1x3xf32>)
^bb8:  // pred: ^bb4
  %69 = vector.extract %36[0, 0] : vector<3xf32> from vector<1x1x3xf32>
  %subview_10 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
  vector.store %69, %subview_10[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
  %70 = arith.addi %33, %c3 : index
  cf.br ^bb2(%70 : index)
^bb9:  // pred: ^bb2
  %71 = arith.addi %31, %c1 : index
  cf.br ^bb1(%71 : index)
^bb10:  // pred: ^bb1
  return
}
// -----// IR Dump Before CSE (cse) //----- //
// (IR identical to the preceding dump)
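// CSE deletes an op whose result recomputes a value that already dominates it.
// Every SSA value in this dispatch is computed exactly once, so the pass finds
// nothing to eliminate and the dumps on either side of it match the
// Canonicalizer output above. A minimal sketch of what it would fold
// (hypothetical %x, %y; not taken from this dump):
//
//   %a = arith.addi %x, %y : index
//   %b = arith.addi %x, %y : index   // redundant recomputation
//
// after cse every use of %b refers to %a and the second addi is erased.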
// -----// IR Dump After CSE (cse) //----- //
// (IR identical to the preceding dump)
// -----// IR Dump Before ArithExpandOpsPass (arith-expand) //----- //
// (IR identical to the preceding dump)
// -----// IR Dump After ArithExpandOpsPass (arith-expand) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
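  // ArithExpandOpsPass rewrites arith ops that have no direct lowering further
  // down the pipeline (arith.ceildivsi, arith.floordivsi, and friends) into
  // sequences of primitive arith ops. This dispatch contains none of those, so
  // the function below is identical to its input. A minimal sketch of the kind
  // of rewrite the pass performs (hypothetical %a, %b; positive-operand case
  // only -- the in-tree pattern also handles negative values):
  //
  //   %q = arith.ceildivsi %a, %b : index
  //
  // expands to roughly
  //
  //   %one = arith.constant 1 : index
  //   %bm1 = arith.subi %b, %one : index
  //   %sum = arith.addi %a, %bm1 : index
  //   %q   = arith.divsi %sum, %b : index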
  %c4 = arith.constant 4 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %subview = memref.subview %28[%workgroup_id_y, 0, 0, 0] [1, 6, 5, 5] [1, 1, 1, 1] : memref<4x6x5x5xf32> to memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>>
  %subview_0 = memref.subview %29[0, %workgroup_id_y, %workgroup_id_x, 0] [2, 1, 1, 9] [1, 1, 1, 1] : memref<2x4x7x9xf32> to memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>>
  %subview_1 = memref.subview %30[0, 0, %workgroup_id_x, 0] [2, 6, 5, 13] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>>
  cf.br ^bb1(%c0 : index)
^bb1(%31: index):  // 2 preds: ^bb0, ^bb9
  %32 = arith.cmpi slt, %31, %c2 : index
  cf.cond_br %32, ^bb2(%c0 : index), ^bb10
^bb2(%33: index):  // 2 preds: ^bb1, ^bb8
  %34 = arith.cmpi slt, %33, %c9 : index
  cf.cond_br %34, ^bb3, ^bb9
^bb3:  // pred: ^bb2
  %subview_2 = memref.subview %subview_1[%31, 0, 0, %33] [1, 6, 5, 7] [1, 1, 1, 1] : memref<2x6x5x13xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>>
  %subview_3 = memref.subview %subview_0[%31, 0, 0, %33] [1, 1, 1, 3] [1, 1, 1, 1] : memref<2x1x1x9xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>>
  cf.br ^bb4(%c0, %cst : index, vector<1x1x3xf32>)
^bb4(%35: index, %36: vector<1x1x3xf32>):  // 2 preds: ^bb3, ^bb7
  %37 = arith.cmpi slt, %35, %c5 : index
  cf.cond_br %37, ^bb5(%c0, %36 : index, vector<1x1x3xf32>), ^bb8
^bb5(%38: index, %39: vector<1x1x3xf32>):  // 2 preds: ^bb4, ^bb6
  %40 = arith.cmpi slt, %38, %c5 : index
  cf.cond_br %40, ^bb6, ^bb7
^bb6:  // pred: ^bb5
  %subview_4 = memref.subview %subview_2[0, 0, %35, %38] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x5x7xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>>
  %subview_5 = memref.subview %subview[0, 0, %35, %38] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x5x5xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>>
  %subview_6 = memref.subview %subview_4[0, 0, 0, 0] [1, 6, 1, 3] [1, 1, 1, 1] : memref<1x6x1x3xf32, strided<[?, ?, ?, 1], offset: ?>> to memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>>
  %subview_7 = memref.subview %subview_5[0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : memref<1x6x1x1xf32, strided<[150, 25, 5, 1], offset: ?>> to memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>>
  %subview_8 = memref.subview %subview_6[0, 0, 0] [1, 6, 3] [1, 1, 1] : memref<1x6x3xf32, strided<[?, ?, 1], offset: ?>> to memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
  %41 = vector.load %subview_8[%c0, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %42 = vector.load %subview_8[%c1, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %43 = vector.load %subview_8[%c2, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %44 = vector.load %subview_8[%c3, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %45 = vector.load %subview_8[%c4, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %46 = vector.load %subview_8[%c5, %c0] : memref<6x3xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, vector<3xf32>
  %subview_9 = memref.subview %subview_7[0, 0, 0] [1, 6, 1] [1, 1, 1] : memref<1x6x1xf32, strided<[150, 25, 5], offset: ?>> to memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %47 = vector.extract %39[0, 0] : vector<3xf32> from vector<1x1x3xf32>
  %48 = memref.load %subview_9[%c0] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %49 = vector.broadcast %48 : f32 to vector<3xf32>
  %50 = vector.fma %41, %49, %47 : vector<3xf32>
  %51 = memref.load %subview_9[%c1] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %52 = vector.broadcast %51 : f32 to vector<3xf32>
  %53 = vector.fma %42, %52, %50 : vector<3xf32>
  %54 = memref.load %subview_9[%c2] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %55 = vector.broadcast %54 : f32 to vector<3xf32>
  %56 = vector.fma %43, %55, %53 : vector<3xf32>
  %57 = memref.load %subview_9[%c3] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %58 = vector.broadcast %57 : f32 to vector<3xf32>
  %59 = vector.fma %44, %58, %56 : vector<3xf32>
  %60 = memref.load %subview_9[%c4] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %61 = vector.broadcast %60 : f32 to vector<3xf32>
  %62 = vector.fma %45, %61, %59 : vector<3xf32>
  %63 = memref.load %subview_9[%c5] : memref<6xf32, affine_map<(d0)[s0] -> (d0 * 25 + s0)>>
  %64 = vector.broadcast %63 : f32 to vector<3xf32>
  %65 = vector.fma %46, %64, %62 : vector<3xf32>
  %66 = vector.broadcast %65 : vector<3xf32> to vector<1x1x3xf32>
  %67 = arith.addi %38, %c1 : index
  cf.br ^bb5(%67, %66 : index, vector<1x1x3xf32>)
^bb7:  // pred: ^bb5
  %68 = arith.addi %35, %c1 : index
  cf.br ^bb4(%68, %39 : index, vector<1x1x3xf32>)
^bb8:  // pred: ^bb4
  %69 = vector.extract %36[0, 0] : vector<3xf32> from vector<1x1x3xf32>
  %subview_10 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[252, 63, 9, 1], offset: ?>> to memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
  vector.store %69, %subview_10[%c0] : memref<3xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<3xf32>
  %70 = arith.addi %33, %c3 : index
  cf.br ^bb2(%70 : index)
^bb9:  // pred: ^bb2
  %71 = arith.addi %31, %c1 : index
  cf.br ^bb1(%71 : index)
^bb10:  // pred: ^bb1
  return
}
// -----// IR Dump Before ExpandOps (memref-expand) //----- //
// -----// IR Dump After ExpandOps (memref-expand) //----- //
// -----// IR Dump Before FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} {
  %c4 = arith.constant 4 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
  %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32>
  memref.assume_alignment %28, 64 : memref<4x6x5x5xf32>
  %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32>
  memref.assume_alignment %29, 64 : memref<2x4x7x9xf32>
  %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27}
  memref.assume_alignment %30, 64 : memref<?x?x?x?xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  cf.br ^bb1(%c0 : index)
^bb1(%31: index):  // 2 preds: ^bb0, ^bb9
  %32 = arith.cmpi slt, %31, %c2 : index
  cf.cond_br %32, ^bb2(%c0 : index), ^bb10
^bb2(%33: index):  // 2 preds: ^bb1, ^bb8
  %34 = arith.cmpi slt, %33, %c9 : index
  cf.cond_br %34, ^bb3, ^bb9
^bb3:  // pred: ^bb2
  cf.br ^bb4(%c0, %cst : index, vector<1x1x3xf32>)
^bb4(%35: index, %36: vector<1x1x3xf32>):  // 2 preds: ^bb3, ^bb7
  %37 = arith.cmpi slt, %35, %c5 : index
  cf.cond_br %37, ^bb5(%c0, %36 : index, vector<1x1x3xf32>), ^bb8
^bb5(%38: index, %39: vector<1x1x3xf32>):  // 2 preds: ^bb4, ^bb6
  %40 = arith.cmpi slt, %38, %c5 : index
  cf.cond_br %40, ^bb6, ^bb7
^bb6:  // pred: ^bb5
  %41 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%workgroup_id_x, %35]
  %42 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38]
  %43 = vector.load %30[%31, %c0, %41, %42] : memref<?x?x?x?xf32>, vector<3xf32>
  %44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%workgroup_id_x, %35]
  %45 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38]
  %46 = vector.load %30[%31, %c0, %44, %45] : memref<?x?x?x?xf32>, vector<3xf32>
  %47 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%workgroup_id_x, %35]
  %48 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38]
  %49 = vector.load %30[%31, %c0, %47, %48] : memref<?x?x?x?xf32>, vector<3xf32>
  %50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%workgroup_id_x, %35]
  %51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38]
  %52 = vector.load %30[%31, %c0, %50, %51] : memref<?x?x?x?xf32>, vector<3xf32>
  %53 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 4)>()[%workgroup_id_x, %35]
  %54 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38]
  %55 = vector.load %30[%31, %c0, %53, %54] : memref<?x?x?x?xf32>, vector<3xf32>
  %56 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 5)>()[%workgroup_id_x, %35]
  %57 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38]
  %58 = vector.load %30[%31, %c0, %56, %57] : memref<?x?x?x?xf32>, vector<3xf32>
  %59 = vector.extract %39[0, 0] : vector<3xf32> from vector<1x1x3xf32>
  %60 = memref.load %28[%workgroup_id_y, %c0, %35, %38] : memref<4x6x5x5xf32>
  %61 = vector.broadcast %60 : f32 to vector<3xf32>
  %62 = vector.fma %43, %61, %59 : vector<3xf32>
  %63 = memref.load %28[%workgroup_id_y, %c1, %35, %38] : memref<4x6x5x5xf32>
  %64 = vector.broadcast %63 : f32 to vector<3xf32>
  %65 = vector.fma %46, %64, %62 : vector<3xf32>
  %66 = memref.load %28[%workgroup_id_y, %c2, %35, %38] : memref<4x6x5x5xf32>
  %67 = vector.broadcast %66 : f32 to vector<3xf32>
  %68 = vector.fma %49, %67, %65 : vector<3xf32>
  %69 = memref.load %28[%workgroup_id_y, %c3, %35, %38] : memref<4x6x5x5xf32>
  %70 = vector.broadcast %69 : f32 to vector<3xf32>
  %71 = vector.fma %52, %70, %68 : vector<3xf32>
  %72 = memref.load %28[%workgroup_id_y, %c4, %35, %38] : memref<4x6x5x5xf32>
  %73 = vector.broadcast %72 : f32 to vector<3xf32>
  %74 = vector.fma %55, %73, %71 : vector<3xf32>
  %75 = memref.load %28[%workgroup_id_y, %c5, %35, %38] : memref<4x6x5x5xf32>
  %76 = vector.broadcast %75 : f32 to vector<3xf32>
  %77 = vector.fma %58, %76, %74 : vector<3xf32>
  %78 = vector.broadcast %77 : vector<3xf32> to vector<1x1x3xf32>
  %79 = arith.addi %38, %c1 : index
  cf.br ^bb5(%79, %78 : index, vector<1x1x3xf32>)
^bb7:  // pred: ^bb5
  %80 = arith.addi %35, %c1 : index
  cf.br ^bb4(%80, %39 : index, vector<1x1x3xf32>)
^bb8:  // pred: ^bb4
  %81 = vector.extract %36[0, 0] : vector<3xf32> from vector<1x1x3xf32>
  vector.store %81, %29[%31, %workgroup_id_y, %workgroup_id_x, %33] : memref<2x4x7x9xf32>, vector<3xf32>
  %82 = arith.addi %33, %c3 : index
  cf.br ^bb2(%82 : index)
^bb9:  // pred: ^bb2
  %83 = arith.addi %31, %c1 : index
  cf.br ^bb1(%83 : index)
^bb10:  // pred: ^bb1
  return
}
// -----// IR Dump Before EmulateNarrowTypePass (iree-codegen-emulate-narrow-type) //----- //
// -----// IR Dump After EmulateNarrowTypePass (iree-codegen-emulate-narrow-type) //----- //
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
arith.cmpi slt, %38, %c5 : index cf.cond_br %40, ^bb6, ^bb7 ^bb6: // pred: ^bb5 %41 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%workgroup_id_x, %35] %42 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %43 = vector.load %30[%31, %c0, %41, %42] : memref, vector<3xf32> %44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%workgroup_id_x, %35] %45 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %46 = vector.load %30[%31, %c0, %44, %45] : memref, vector<3xf32> %47 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%workgroup_id_x, %35] %48 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %49 = vector.load %30[%31, %c0, %47, %48] : memref, vector<3xf32> %50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%workgroup_id_x, %35] %51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %52 = vector.load %30[%31, %c0, %50, %51] : memref, vector<3xf32> %53 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 4)>()[%workgroup_id_x, %35] %54 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %55 = vector.load %30[%31, %c0, %53, %54] : memref, vector<3xf32> %56 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 5)>()[%workgroup_id_x, %35] %57 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %58 = vector.load %30[%31, %c0, %56, %57] : memref, vector<3xf32> %59 = vector.extract %39[0, 0] : vector<3xf32> from vector<1x1x3xf32> %60 = memref.load %28[%workgroup_id_y, %c0, %35, %38] : memref<4x6x5x5xf32> %61 = vector.broadcast %60 : f32 to vector<3xf32> %62 = vector.fma %43, %61, %59 : vector<3xf32> %63 = memref.load %28[%workgroup_id_y, %c1, %35, %38] : memref<4x6x5x5xf32> %64 = vector.broadcast %63 : f32 to vector<3xf32> %65 = vector.fma %46, %64, %62 : vector<3xf32> %66 = memref.load %28[%workgroup_id_y, %c2, %35, %38] : memref<4x6x5x5xf32> %67 = vector.broadcast %66 : f32 to vector<3xf32> %68 = vector.fma %49, %67, %65 : vector<3xf32> %69 = memref.load %28[%workgroup_id_y, %c3, %35, %38] : memref<4x6x5x5xf32> %70 = vector.broadcast %69 : f32 to vector<3xf32> %71 = vector.fma %52, %70, %68 : vector<3xf32> %72 = memref.load %28[%workgroup_id_y, %c4, %35, %38] : memref<4x6x5x5xf32> %73 = vector.broadcast %72 : f32 to vector<3xf32> %74 = vector.fma %55, %73, %71 : vector<3xf32> %75 = memref.load %28[%workgroup_id_y, %c5, %35, %38] : memref<4x6x5x5xf32> %76 = vector.broadcast %75 : f32 to vector<3xf32> %77 = vector.fma %58, %76, %74 : vector<3xf32> %78 = vector.broadcast %77 : vector<3xf32> to vector<1x1x3xf32> %79 = arith.addi %38, %c1 : index cf.br ^bb5(%79, %78 : index, vector<1x1x3xf32>) ^bb7: // pred: ^bb5 %80 = arith.addi %35, %c1 : index cf.br ^bb4(%80, %39 : index, vector<1x1x3xf32>) ^bb8: // pred: ^bb4 %81 = vector.extract %36[0, 0] : vector<3xf32> from vector<1x1x3xf32> vector.store %81, %29[%31, %workgroup_id_y, %workgroup_id_x, %33] : memref<2x4x7x9xf32>, vector<3xf32> %82 = arith.addi %33, %c3 : index cf.br ^bb2(%82 : index) ^bb9: // pred: ^bb2 %83 = arith.addi %31, %c1 : index cf.br ^bb1(%83 : index) ^bb10: // pred: ^bb1 return } // -----// IR Dump After Canonicalizer (canonicalize) //----- // func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : 
index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index cf.br ^bb1(%c0 : index) ^bb1(%31: index): // 2 preds: ^bb0, ^bb8 %32 = arith.cmpi slt, %31, %c2 : index cf.cond_br %32, ^bb2(%c0 : index), ^bb9 ^bb2(%33: index): // 2 preds: ^bb1, ^bb7 %34 = arith.cmpi slt, %33, %c9 : index cf.cond_br %34, ^bb3(%c0, %cst : index, vector<1x1x3xf32>), ^bb8 ^bb3(%35: index, %36: vector<1x1x3xf32>): // 2 preds: ^bb2, ^bb6 %37 = arith.cmpi slt, %35, %c5 : index cf.cond_br %37, ^bb4(%c0, %36 : index, vector<1x1x3xf32>), ^bb7 ^bb4(%38: index, %39: vector<1x1x3xf32>): // 2 preds: ^bb3, ^bb5 %40 = arith.cmpi slt, %38, %c5 : index cf.cond_br %40, ^bb5, ^bb6 ^bb5: // pred: ^bb4 
%41 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%workgroup_id_x, %35] %42 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %43 = vector.load %30[%31, %c0, %41, %42] : memref<?x?x?x?xf32>, vector<3xf32> %44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%workgroup_id_x, %35] %45 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %46 = vector.load %30[%31, %c0, %44, %45] : memref<?x?x?x?xf32>, vector<3xf32> %47 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%workgroup_id_x, %35] %48 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %49 = vector.load %30[%31, %c0, %47, %48] : memref<?x?x?x?xf32>, vector<3xf32> %50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%workgroup_id_x, %35] %51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %52 = vector.load %30[%31, %c0, %50, %51] : memref<?x?x?x?xf32>, vector<3xf32> %53 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 4)>()[%workgroup_id_x, %35] %54 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %55 = vector.load %30[%31, %c0, %53, %54] : memref<?x?x?x?xf32>, vector<3xf32> %56 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 5)>()[%workgroup_id_x, %35] %57 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %58 = vector.load %30[%31, %c0, %56, %57] : memref<?x?x?x?xf32>, vector<3xf32> %59 = vector.extract %39[0, 0] : vector<3xf32> from vector<1x1x3xf32> %60 = memref.load %28[%workgroup_id_y, %c0, %35, %38] : memref<4x6x5x5xf32> %61 = vector.broadcast %60 : f32 to vector<3xf32> %62 = vector.fma %43, %61, %59 : vector<3xf32> %63 = memref.load %28[%workgroup_id_y, %c1, %35, %38] : memref<4x6x5x5xf32> %64 = vector.broadcast %63 : f32 to vector<3xf32> %65 = vector.fma %46, %64, %62 : vector<3xf32> %66 = memref.load %28[%workgroup_id_y, %c2, %35, %38] : memref<4x6x5x5xf32> %67 = vector.broadcast %66 : f32 to vector<3xf32> %68 = vector.fma %49, %67, %65 : vector<3xf32> %69 = memref.load %28[%workgroup_id_y, %c3, %35, %38] : memref<4x6x5x5xf32> %70 = vector.broadcast %69 : f32 to vector<3xf32> %71 = vector.fma %52, %70, %68 : vector<3xf32> %72 = memref.load %28[%workgroup_id_y, %c4, %35, %38] : memref<4x6x5x5xf32> %73 = vector.broadcast %72 : f32 to vector<3xf32> %74 = vector.fma %55, %73, %71 : vector<3xf32> %75 = memref.load %28[%workgroup_id_y, %c5, %35, %38] : memref<4x6x5x5xf32> %76 = vector.broadcast %75 : f32 to vector<3xf32> %77 = vector.fma %58, %76, %74 : vector<3xf32> %78 = vector.broadcast %77 : vector<3xf32> to vector<1x1x3xf32> %79 = arith.addi %38, %c1 : index cf.br ^bb4(%79, %78 : index, vector<1x1x3xf32>) ^bb6: // pred: ^bb4 %80 = arith.addi %35, %c1 : index cf.br ^bb3(%80, %39 : index, vector<1x1x3xf32>) ^bb7: // pred: ^bb3 %81 = vector.extract %36[0, 0] : vector<3xf32> from vector<1x1x3xf32> vector.store %81, %29[%31, %workgroup_id_y, %workgroup_id_x, %33] : memref<2x4x7x9xf32>, vector<3xf32> %82 = arith.addi %33, %c3 : index cf.br ^bb2(%82 : index) ^bb8: // pred: ^bb2 %83 = arith.addi %31, %c1 : index cf.br ^bb1(%83 : index) ^bb9: // pred: ^bb1 return }
// -----// IR Dump After CSE (cse) //----- //
func.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32() attributes {translation_info = #iree_codegen.translation_info} { %c4 = arith.constant 4 : index %cst = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> %c5 = arith.constant 5 : index %c3 = arith.constant 3 : index %c9 = arith.constant 9 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %0 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>)
ordinal(0) : i32 %1 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32 %3 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32 %4 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(4) : i32 %5 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(5) : i32 %6 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(6) : i32 %7 = hal.interface.constant.load layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(7) : i32 %8 = arith.extui %0 : i32 to i64 %9 = arith.extui %1 : i32 to i64 %10 = arith.shli %9, %c32_i64 : i64 %11 = arith.ori %8, %10 : i64 %12 = arith.index_castui %11 : i64 to index %13 = arith.extui %2 : i32 to i64 %14 = arith.extui %3 : i32 to i64 %15 = arith.shli %14, %c32_i64 : i64 %16 = arith.ori %13, %15 : i64 %17 = arith.index_castui %16 : i64 to index %18 = arith.extui %4 : i32 to i64 %19 = arith.extui %5 : i32 to i64 %20 = arith.shli %19, %c32_i64 : i64 %21 = arith.ori %18, %20 : i64 %22 = arith.index_castui %21 : i64 to index %23 = arith.extui %6 : i32 to i64 %24 = arith.extui %7 : i32 to i64 %25 = arith.shli %24, %c32_i64 : i64 %26 = arith.ori %23, %25 : i64 %27 = arith.index_castui %26 : i64 to index %28 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4x6x5x5xf32> memref.assume_alignment %28, 64 : memref<4x6x5x5xf32> %29 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x4x7x9xf32> memref.assume_alignment %29, 64 : memref<2x4x7x9xf32> %30 = hal.interface.binding.subspan layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x?x?xf32>{%12, %17, %22, %27} memref.assume_alignment %30, 64 : memref<?x?x?x?xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index cf.br ^bb1(%c0 : index) ^bb1(%31: index): // 2 preds: ^bb0, ^bb8 %32 = arith.cmpi slt, %31, %c2 : index cf.cond_br %32, ^bb2(%c0 : index), ^bb9 ^bb2(%33: index): // 2 preds: ^bb1, ^bb7 %34 = arith.cmpi slt, %33, %c9 : index cf.cond_br %34, ^bb3(%c0, %cst : index, vector<1x1x3xf32>), ^bb8 ^bb3(%35: index, %36: vector<1x1x3xf32>): // 2 preds: ^bb2, ^bb6 %37 = arith.cmpi slt, %35, %c5 : index cf.cond_br %37, ^bb4(%c0, %36 : index, vector<1x1x3xf32>), ^bb7 ^bb4(%38: index, %39: vector<1x1x3xf32>): // 2 preds: ^bb3, ^bb5 %40 = arith.cmpi slt, %38, %c5 : index cf.cond_br %40, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %41 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%workgroup_id_x, %35] %42 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%33, %38] %43 = vector.load %30[%31, %c0,
%41, %42] : memref<?x?x?x?xf32>, vector<3xf32> %44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%workgroup_id_x, %35] %45 = vector.load %30[%31, %c0, %44, %42] : memref<?x?x?x?xf32>, vector<3xf32> %46 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%workgroup_id_x, %35] %47 = vector.load %30[%31, %c0, %46, %42] : memref<?x?x?x?xf32>, vector<3xf32> %48 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%workgroup_id_x, %35] %49 = vector.load %30[%31, %c0, %48, %42] : memref<?x?x?x?xf32>, vector<3xf32> %50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 4)>()[%workgroup_id_x, %35] %51 = vector.load %30[%31, %c0, %50, %42] : memref<?x?x?x?xf32>, vector<3xf32> %52 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 5)>()[%workgroup_id_x, %35] %53 = vector.load %30[%31, %c0, %52, %42] : memref<?x?x?x?xf32>, vector<3xf32> %54 = vector.extract %39[0, 0] : vector<3xf32> from vector<1x1x3xf32> %55 = memref.load %28[%workgroup_id_y, %c0, %35, %38] : memref<4x6x5x5xf32> %56 = vector.broadcast %55 : f32 to vector<3xf32> %57 = vector.fma %43, %56, %54 : vector<3xf32> %58 = memref.load %28[%workgroup_id_y, %c1, %35, %38] : memref<4x6x5x5xf32> %59 = vector.broadcast %58 : f32 to vector<3xf32> %60 = vector.fma %45, %59, %57 : vector<3xf32> %61 = memref.load %28[%workgroup_id_y, %c2, %35, %38] : memref<4x6x5x5xf32> %62 = vector.broadcast %61 : f32 to vector<3xf32> %63 = vector.fma %47, %62, %60 : vector<3xf32> %64 = memref.load %28[%workgroup_id_y, %c3, %35, %38] : memref<4x6x5x5xf32> %65 = vector.broadcast %64 : f32 to vector<3xf32> %66 = vector.fma %49, %65, %63 : vector<3xf32> %67 = memref.load %28[%workgroup_id_y, %c4, %35, %38] : memref<4x6x5x5xf32> %68 = vector.broadcast %67 : f32 to vector<3xf32> %69 = vector.fma %51, %68, %66 : vector<3xf32> %70 = memref.load %28[%workgroup_id_y, %c5, %35, %38] : memref<4x6x5x5xf32> %71 = vector.broadcast %70 : f32 to vector<3xf32> %72 = vector.fma %53, %71, %69 : vector<3xf32> %73 = vector.broadcast %72 : vector<3xf32> to vector<1x1x3xf32> %74 = arith.addi %38, %c1 : index cf.br ^bb4(%74, %73 : index, vector<1x1x3xf32>) ^bb6: // pred: ^bb4 %75 = arith.addi %35, %c1 : index cf.br ^bb3(%75, %39 : index, vector<1x1x3xf32>) ^bb7: // pred: ^bb3 %76 = vector.extract %36[0, 0] : vector<3xf32> from vector<1x1x3xf32> vector.store %76, %29[%31, %workgroup_id_y, %workgroup_id_x, %33] : memref<2x4x7x9xf32>, vector<3xf32> %77 = arith.addi %33, %c3 : index cf.br ^bb2(%77 : index) ^bb8: // pred: ^bb2 %78 = arith.addi %31, %c1 : index cf.br ^bb1(%78 : index) ^bb9: // pred: ^bb1 return }
// -----// IR Dump After ConvertToLLVMPass (iree-convert-to-llvm) //----- //
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr ->
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %22 = llvm.extractvalue %21[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %23 = llvm.getelementptr %22[3] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %26 = llvm.extractvalue %25[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %27 = llvm.getelementptr %26[4] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %30 = llvm.extractvalue %29[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %31 = llvm.getelementptr %30[5] : (!llvm.ptr) -> !llvm.ptr, i32 %32 = llvm.load %31 : !llvm.ptr -> i32 %33 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %34 = llvm.extractvalue %33[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %35 = llvm.getelementptr %34[6] : (!llvm.ptr) -> !llvm.ptr, i32 %36 = llvm.load %35 : !llvm.ptr -> i32 %37 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %38 = llvm.extractvalue %37[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %39 = llvm.getelementptr %38[7] : (!llvm.ptr) -> !llvm.ptr, i32 %40 = llvm.load %39 : !llvm.ptr -> i32 %41 = llvm.zext %20 : i32 to i64 %42 = llvm.zext %24 : i32 to i64 %43 = llvm.shl %42, %16 : i64 %44 = llvm.or %41, %43 : i64 %45 = llvm.zext %28 : i32 to i64 %46 = llvm.zext %32 : i32 to i64 %47 = llvm.shl %46, %16 : i64 %48 = llvm.or %45, %47 : i64 %49 = llvm.zext %36 : i32 to i64 %50 = llvm.zext %40 : i32 to i64 %51 = llvm.shl %50, %16 : i64 %52 = llvm.or %49, %51 : i64 %53 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %54 = llvm.extractvalue %53[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %55 = llvm.getelementptr %54[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %56 = llvm.load %55 : !llvm.ptr -> !llvm.ptr %57 = llvm.ptrtoint %56 : !llvm.ptr to i64 %58 = llvm.and %57, %4 : i64 %59 = llvm.icmp "eq" %58, %15 : i64 "llvm.intr.assume"(%59) : (i1) -> () %60 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %61 = llvm.extractvalue %60[10] : 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %62 = llvm.getelementptr %61[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %63 = llvm.load %62 : !llvm.ptr -> !llvm.ptr %64 = llvm.ptrtoint %63 : !llvm.ptr to i64 %65 = llvm.and %64, %4 : i64 %66 = llvm.icmp "eq" %65, %15 : i64 "llvm.intr.assume"(%66) : (i1) -> () %67 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %68 = llvm.extractvalue %67[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %69 = llvm.load %68 : !llvm.ptr -> !llvm.ptr %70 = llvm.mul %52, %2 : i64 %71 = llvm.mul %70, %48 : i64 %72 = llvm.mul %71, %44 : i64 %73 = llvm.ptrtoint %69 : !llvm.ptr to i64 %74 = llvm.and %73, %4 : i64 %75 = llvm.icmp "eq" %74, %15 : i64 "llvm.intr.assume"(%75) : (i1) -> () %76 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %77 = llvm.extractvalue %76[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %78 = llvm.zext %77 : i32 to i64 %79 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %80 = llvm.extractvalue %79[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %81 = llvm.zext %80 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%82: i64): // 2 preds: ^bb0, ^bb8 %83 = llvm.icmp "slt" %82, %14 : i64 llvm.cond_br %83, ^bb2(%15 : i64), ^bb9 ^bb2(%84: i64): // 2 preds: ^bb1, ^bb7 %85 = llvm.icmp "slt" %84, %12 : i64 llvm.cond_br %85, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%86: i64, %87: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %88 = llvm.icmp "slt" %86, %10 : i64 llvm.cond_br %88, ^bb4(%15, %87 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%89: i64, %90: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %91 = llvm.icmp "slt" %89, %10 : i64 llvm.cond_br %91, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %92 = llvm.add %78, %86 : i64 %93 = llvm.add %84, %89 : i64 %94 = llvm.mul %82, %72 : i64 %95 = llvm.mul %71, %15 : i64 %96 = llvm.add %94, %95 : i64 %97 = llvm.mul %92, %70 : i64 %98 = llvm.add %96, %97 : i64 %99 = llvm.add %98, %93 : i64 %100 = llvm.getelementptr %69[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %78, %86 : i64 %103 = llvm.add %102, %13 : i64 %104 = llvm.mul %82, %72 : i64 %105 = llvm.mul %71, %15 : i64 %106 = llvm.add %104, %105 : i64 %107 = llvm.mul %103, %70 : i64 %108 = llvm.add %106, %107 : i64 %109 = llvm.add %108, %93 : i64 %110 = llvm.getelementptr %69[%109] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %111 = llvm.load %110 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %112 = llvm.add %78, %86 : i64 %113 = llvm.add %112, %14 : i64 %114 = llvm.mul %82, %72 : i64 %115 = llvm.mul %71, %15 : i64 %116 = llvm.add %114, %115 : i64 %117 = llvm.mul %113, %70 : i64 %118 = llvm.add %116, %117 : i64 %119 = llvm.add %118, %93 : i64 %120 = llvm.getelementptr %69[%119] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %121 = llvm.load %120 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %122 = llvm.add %78, %86 : i64 %123 = llvm.add %122, %11 : i64 %124 = llvm.mul %82, %72 : i64 %125 = llvm.mul %71, %15 : i64 %126 = 
llvm.add %124, %125 : i64 %127 = llvm.mul %123, %70 : i64 %128 = llvm.add %126, %127 : i64 %129 = llvm.add %128, %93 : i64 %130 = llvm.getelementptr %69[%129] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %131 = llvm.load %130 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %132 = llvm.add %78, %86 : i64 %133 = llvm.add %132, %8 : i64 %134 = llvm.mul %82, %72 : i64 %135 = llvm.mul %71, %15 : i64 %136 = llvm.add %134, %135 : i64 %137 = llvm.mul %133, %70 : i64 %138 = llvm.add %136, %137 : i64 %139 = llvm.add %138, %93 : i64 %140 = llvm.getelementptr %69[%139] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %141 = llvm.load %140 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %142 = llvm.add %78, %86 : i64 %143 = llvm.add %142, %10 : i64 %144 = llvm.mul %82, %72 : i64 %145 = llvm.mul %71, %15 : i64 %146 = llvm.add %144, %145 : i64 %147 = llvm.mul %143, %70 : i64 %148 = llvm.add %146, %147 : i64 %149 = llvm.add %148, %93 : i64 %150 = llvm.getelementptr %69[%149] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %151 = llvm.load %150 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %152 = llvm.extractvalue %90[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %153 = llvm.mul %81, %6 : i64 %154 = llvm.mul %15, %5 : i64 %155 = llvm.add %153, %154 : i64 %156 = llvm.mul %86, %10 : i64 %157 = llvm.add %155, %156 : i64 %158 = llvm.add %157, %89 : i64 %159 = llvm.getelementptr %56[%158] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %160 = llvm.load %159 : !llvm.ptr -> f32 %161 = llvm.insertelement %160, %1[%0 : i32] : vector<3xf32> %162 = llvm.shufflevector %161, %1 [0, 0, 0] : vector<3xf32> %163 = llvm.intr.fmuladd(%101, %162, %152) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %164 = llvm.mul %81, %6 : i64 %165 = llvm.mul %13, %5 : i64 %166 = llvm.add %164, %165 : i64 %167 = llvm.mul %86, %10 : i64 %168 = llvm.add %166, %167 : i64 %169 = llvm.add %168, %89 : i64 %170 = llvm.getelementptr %56[%169] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %171 = llvm.load %170 : !llvm.ptr -> f32 %172 = llvm.insertelement %171, %1[%0 : i32] : vector<3xf32> %173 = llvm.shufflevector %172, %1 [0, 0, 0] : vector<3xf32> %174 = llvm.intr.fmuladd(%111, %173, %163) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %175 = llvm.mul %81, %6 : i64 %176 = llvm.mul %14, %5 : i64 %177 = llvm.add %175, %176 : i64 %178 = llvm.mul %86, %10 : i64 %179 = llvm.add %177, %178 : i64 %180 = llvm.add %179, %89 : i64 %181 = llvm.getelementptr %56[%180] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %182 = llvm.load %181 : !llvm.ptr -> f32 %183 = llvm.insertelement %182, %1[%0 : i32] : vector<3xf32> %184 = llvm.shufflevector %183, %1 [0, 0, 0] : vector<3xf32> %185 = llvm.intr.fmuladd(%121, %184, %174) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %186 = llvm.mul %81, %6 : i64 %187 = llvm.mul %11, %5 : i64 %188 = llvm.add %186, %187 : i64 %189 = llvm.mul %86, %10 : i64 %190 = llvm.add %188, %189 : i64 %191 = llvm.add %190, %89 : i64 %192 = llvm.getelementptr %56[%191] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %193 = llvm.load %192 : !llvm.ptr -> f32 %194 = llvm.insertelement %193, %1[%0 : i32] : vector<3xf32> %195 = llvm.shufflevector %194, %1 [0, 0, 0] : vector<3xf32> %196 = llvm.intr.fmuladd(%131, %195, %185) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %197 = llvm.mul %81, %6 : i64 %198 = llvm.mul %8, %5 : i64 %199 = llvm.add %197, %198 : i64 %200 = llvm.mul %86, %10 : i64 %201 = llvm.add %199, %200 : i64 %202 = llvm.add %201, %89 : i64 %203 = llvm.getelementptr %56[%202] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %204 = 
llvm.load %203 : !llvm.ptr -> f32 %205 = llvm.insertelement %204, %1[%0 : i32] : vector<3xf32> %206 = llvm.shufflevector %205, %1 [0, 0, 0] : vector<3xf32> %207 = llvm.intr.fmuladd(%141, %206, %196) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %208 = llvm.mul %81, %6 : i64 %209 = llvm.mul %10, %5 : i64 %210 = llvm.add %208, %209 : i64 %211 = llvm.mul %86, %10 : i64 %212 = llvm.add %210, %211 : i64 %213 = llvm.add %212, %89 : i64 %214 = llvm.getelementptr %56[%213] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %215 = llvm.load %214 : !llvm.ptr -> f32 %216 = llvm.insertelement %215, %1[%0 : i32] : vector<3xf32> %217 = llvm.shufflevector %216, %1 [0, 0, 0] : vector<3xf32> %218 = llvm.intr.fmuladd(%151, %217, %207) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %219 = llvm.insertvalue %218, %7[0] : !llvm.array<1 x vector<3xf32>> %220 = llvm.insertvalue %219, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %221 = llvm.add %89, %13 : i64 llvm.br ^bb4(%221, %220 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %222 = llvm.add %86, %13 : i64 llvm.br ^bb3(%222, %90 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %223 = llvm.extractvalue %87[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %224 = llvm.mul %82, %3 : i64 %225 = llvm.mul %81, %4 : i64 %226 = llvm.add %224, %225 : i64 %227 = llvm.mul %78, %12 : i64 %228 = llvm.add %226, %227 : i64 %229 = llvm.add %228, %84 : i64 %230 = llvm.getelementptr %63[%229] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %223, %230 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %231 = llvm.add %84, %11 : i64 llvm.br ^bb2(%231 : i64) ^bb8: // pred: ^bb2 %232 = llvm.add %82, %13 : i64 llvm.br ^bb1(%232 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } }
// -----// IR Dump After ReconcileUnrealizedCasts (reconcile-unrealized-casts) //----- //
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %22 = llvm.extractvalue %21[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %23 = llvm.getelementptr %22[3] : (!llvm.ptr) ->
!llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %26 = llvm.extractvalue %25[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %27 = llvm.getelementptr %26[4] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %30 = llvm.extractvalue %29[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %31 = llvm.getelementptr %30[5] : (!llvm.ptr) -> !llvm.ptr, i32 %32 = llvm.load %31 : !llvm.ptr -> i32 %33 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %34 = llvm.extractvalue %33[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %35 = llvm.getelementptr %34[6] : (!llvm.ptr) -> !llvm.ptr, i32 %36 = llvm.load %35 : !llvm.ptr -> i32 %37 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %38 = llvm.extractvalue %37[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %39 = llvm.getelementptr %38[7] : (!llvm.ptr) -> !llvm.ptr, i32 %40 = llvm.load %39 : !llvm.ptr -> i32 %41 = llvm.zext %20 : i32 to i64 %42 = llvm.zext %24 : i32 to i64 %43 = llvm.shl %42, %16 : i64 %44 = llvm.or %41, %43 : i64 %45 = llvm.zext %28 : i32 to i64 %46 = llvm.zext %32 : i32 to i64 %47 = llvm.shl %46, %16 : i64 %48 = llvm.or %45, %47 : i64 %49 = llvm.zext %36 : i32 to i64 %50 = llvm.zext %40 : i32 to i64 %51 = llvm.shl %50, %16 : i64 %52 = llvm.or %49, %51 : i64 %53 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %54 = llvm.extractvalue %53[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %55 = llvm.getelementptr %54[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %56 = llvm.load %55 : !llvm.ptr -> !llvm.ptr %57 = llvm.ptrtoint %56 : !llvm.ptr to i64 %58 = llvm.and %57, %4 : i64 %59 = llvm.icmp "eq" %58, %15 : i64 "llvm.intr.assume"(%59) : (i1) -> () %60 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %61 = llvm.extractvalue %60[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %62 = llvm.getelementptr %61[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %63 = llvm.load %62 : !llvm.ptr -> !llvm.ptr %64 = llvm.ptrtoint %63 : !llvm.ptr to i64 %65 = llvm.and %64, %4 : i64 %66 = llvm.icmp "eq" %65, %15 : i64 "llvm.intr.assume"(%66) : (i1) -> () %67 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %68 = llvm.extractvalue %67[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %69 = llvm.load %68 : !llvm.ptr -> !llvm.ptr %70 = llvm.mul %52, %2 : i64 
%71 = llvm.mul %70, %48 : i64 %72 = llvm.mul %71, %44 : i64 %73 = llvm.ptrtoint %69 : !llvm.ptr to i64 %74 = llvm.and %73, %4 : i64 %75 = llvm.icmp "eq" %74, %15 : i64 "llvm.intr.assume"(%75) : (i1) -> () %76 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %77 = llvm.extractvalue %76[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %78 = llvm.zext %77 : i32 to i64 %79 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %80 = llvm.extractvalue %79[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %81 = llvm.zext %80 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%82: i64): // 2 preds: ^bb0, ^bb8 %83 = llvm.icmp "slt" %82, %14 : i64 llvm.cond_br %83, ^bb2(%15 : i64), ^bb9 ^bb2(%84: i64): // 2 preds: ^bb1, ^bb7 %85 = llvm.icmp "slt" %84, %12 : i64 llvm.cond_br %85, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%86: i64, %87: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %88 = llvm.icmp "slt" %86, %10 : i64 llvm.cond_br %88, ^bb4(%15, %87 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%89: i64, %90: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %91 = llvm.icmp "slt" %89, %10 : i64 llvm.cond_br %91, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %92 = llvm.add %78, %86 : i64 %93 = llvm.add %84, %89 : i64 %94 = llvm.mul %82, %72 : i64 %95 = llvm.mul %71, %15 : i64 %96 = llvm.add %94, %95 : i64 %97 = llvm.mul %92, %70 : i64 %98 = llvm.add %96, %97 : i64 %99 = llvm.add %98, %93 : i64 %100 = llvm.getelementptr %69[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %78, %86 : i64 %103 = llvm.add %102, %13 : i64 %104 = llvm.mul %82, %72 : i64 %105 = llvm.mul %71, %15 : i64 %106 = llvm.add %104, %105 : i64 %107 = llvm.mul %103, %70 : i64 %108 = llvm.add %106, %107 : i64 %109 = llvm.add %108, %93 : i64 %110 = llvm.getelementptr %69[%109] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %111 = llvm.load %110 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %112 = llvm.add %78, %86 : i64 %113 = llvm.add %112, %14 : i64 %114 = llvm.mul %82, %72 : i64 %115 = llvm.mul %71, %15 : i64 %116 = llvm.add %114, %115 : i64 %117 = llvm.mul %113, %70 : i64 %118 = llvm.add %116, %117 : i64 %119 = llvm.add %118, %93 : i64 %120 = llvm.getelementptr %69[%119] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %121 = llvm.load %120 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %122 = llvm.add %78, %86 : i64 %123 = llvm.add %122, %11 : i64 %124 = llvm.mul %82, %72 : i64 %125 = llvm.mul %71, %15 : i64 %126 = llvm.add %124, %125 : i64 %127 = llvm.mul %123, %70 : i64 %128 = llvm.add %126, %127 : i64 %129 = llvm.add %128, %93 : i64 %130 = llvm.getelementptr %69[%129] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %131 = llvm.load %130 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %132 = llvm.add %78, %86 : i64 %133 = llvm.add %132, %8 : i64 %134 = llvm.mul %82, %72 : i64 %135 = llvm.mul %71, %15 : i64 %136 = llvm.add %134, %135 : i64 %137 = llvm.mul %133, %70 : i64 %138 = llvm.add %136, %137 : i64 %139 = llvm.add %138, %93 : i64 %140 = llvm.getelementptr %69[%139] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %141 = llvm.load %140 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %142 = llvm.add %78, %86 : i64 %143 = llvm.add %142, %10 : i64 %144 = llvm.mul %82, %72 : 
i64 %145 = llvm.mul %71, %15 : i64 %146 = llvm.add %144, %145 : i64 %147 = llvm.mul %143, %70 : i64 %148 = llvm.add %146, %147 : i64 %149 = llvm.add %148, %93 : i64 %150 = llvm.getelementptr %69[%149] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %151 = llvm.load %150 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %152 = llvm.extractvalue %90[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %153 = llvm.mul %81, %6 : i64 %154 = llvm.mul %15, %5 : i64 %155 = llvm.add %153, %154 : i64 %156 = llvm.mul %86, %10 : i64 %157 = llvm.add %155, %156 : i64 %158 = llvm.add %157, %89 : i64 %159 = llvm.getelementptr %56[%158] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %160 = llvm.load %159 : !llvm.ptr -> f32 %161 = llvm.insertelement %160, %1[%0 : i32] : vector<3xf32> %162 = llvm.shufflevector %161, %1 [0, 0, 0] : vector<3xf32> %163 = llvm.intr.fmuladd(%101, %162, %152) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %164 = llvm.mul %81, %6 : i64 %165 = llvm.mul %13, %5 : i64 %166 = llvm.add %164, %165 : i64 %167 = llvm.mul %86, %10 : i64 %168 = llvm.add %166, %167 : i64 %169 = llvm.add %168, %89 : i64 %170 = llvm.getelementptr %56[%169] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %171 = llvm.load %170 : !llvm.ptr -> f32 %172 = llvm.insertelement %171, %1[%0 : i32] : vector<3xf32> %173 = llvm.shufflevector %172, %1 [0, 0, 0] : vector<3xf32> %174 = llvm.intr.fmuladd(%111, %173, %163) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %175 = llvm.mul %81, %6 : i64 %176 = llvm.mul %14, %5 : i64 %177 = llvm.add %175, %176 : i64 %178 = llvm.mul %86, %10 : i64 %179 = llvm.add %177, %178 : i64 %180 = llvm.add %179, %89 : i64 %181 = llvm.getelementptr %56[%180] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %182 = llvm.load %181 : !llvm.ptr -> f32 %183 = llvm.insertelement %182, %1[%0 : i32] : vector<3xf32> %184 = llvm.shufflevector %183, %1 [0, 0, 0] : vector<3xf32> %185 = llvm.intr.fmuladd(%121, %184, %174) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %186 = llvm.mul %81, %6 : i64 %187 = llvm.mul %11, %5 : i64 %188 = llvm.add %186, %187 : i64 %189 = llvm.mul %86, %10 : i64 %190 = llvm.add %188, %189 : i64 %191 = llvm.add %190, %89 : i64 %192 = llvm.getelementptr %56[%191] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %193 = llvm.load %192 : !llvm.ptr -> f32 %194 = llvm.insertelement %193, %1[%0 : i32] : vector<3xf32> %195 = llvm.shufflevector %194, %1 [0, 0, 0] : vector<3xf32> %196 = llvm.intr.fmuladd(%131, %195, %185) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %197 = llvm.mul %81, %6 : i64 %198 = llvm.mul %8, %5 : i64 %199 = llvm.add %197, %198 : i64 %200 = llvm.mul %86, %10 : i64 %201 = llvm.add %199, %200 : i64 %202 = llvm.add %201, %89 : i64 %203 = llvm.getelementptr %56[%202] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %204 = llvm.load %203 : !llvm.ptr -> f32 %205 = llvm.insertelement %204, %1[%0 : i32] : vector<3xf32> %206 = llvm.shufflevector %205, %1 [0, 0, 0] : vector<3xf32> %207 = llvm.intr.fmuladd(%141, %206, %196) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %208 = llvm.mul %81, %6 : i64 %209 = llvm.mul %10, %5 : i64 %210 = llvm.add %208, %209 : i64 %211 = llvm.mul %86, %10 : i64 %212 = llvm.add %210, %211 : i64 %213 = llvm.add %212, %89 : i64 %214 = llvm.getelementptr %56[%213] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %215 = llvm.load %214 : !llvm.ptr -> f32 %216 = llvm.insertelement %215, %1[%0 : i32] : vector<3xf32> %217 = llvm.shufflevector %216, %1 [0, 0, 0] : vector<3xf32> %218 = llvm.intr.fmuladd(%151, %217, %207) : (vector<3xf32>, vector<3xf32>, 
vector<3xf32>) -> vector<3xf32> %219 = llvm.insertvalue %218, %7[0] : !llvm.array<1 x vector<3xf32>> %220 = llvm.insertvalue %219, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %221 = llvm.add %89, %13 : i64 llvm.br ^bb4(%221, %220 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %222 = llvm.add %86, %13 : i64 llvm.br ^bb3(%222, %90 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %223 = llvm.extractvalue %87[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %224 = llvm.mul %82, %3 : i64 %225 = llvm.mul %81, %4 : i64 %226 = llvm.add %224, %225 : i64 %227 = llvm.mul %78, %12 : i64 %228 = llvm.add %226, %227 : i64 %229 = llvm.add %228, %84 : i64 %230 = llvm.getelementptr %63[%229] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %223, %230 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %231 = llvm.add %84, %11 : i64 llvm.br ^bb2(%231 : i64) ^bb8: // pred: ^bb2 %232 = llvm.add %82, %13 : i64 llvm.br ^bb1(%232 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } // -----// IR Dump Before LLVMCPUSynchronizeSymbolVisibilityPass (iree-llvmcpu-synchronize-symbol-visibility) //----- // module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %22 = llvm.extractvalue %21[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %23 = llvm.getelementptr %22[3] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %26 = llvm.extractvalue %25[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, 
ptr)> %27 = llvm.getelementptr %26[4] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %30 = llvm.extractvalue %29[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %31 = llvm.getelementptr %30[5] : (!llvm.ptr) -> !llvm.ptr, i32 %32 = llvm.load %31 : !llvm.ptr -> i32 %33 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %34 = llvm.extractvalue %33[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %35 = llvm.getelementptr %34[6] : (!llvm.ptr) -> !llvm.ptr, i32 %36 = llvm.load %35 : !llvm.ptr -> i32 %37 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %38 = llvm.extractvalue %37[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %39 = llvm.getelementptr %38[7] : (!llvm.ptr) -> !llvm.ptr, i32 %40 = llvm.load %39 : !llvm.ptr -> i32 %41 = llvm.zext %20 : i32 to i64 %42 = llvm.zext %24 : i32 to i64 %43 = llvm.shl %42, %16 : i64 %44 = llvm.or %41, %43 : i64 %45 = llvm.zext %28 : i32 to i64 %46 = llvm.zext %32 : i32 to i64 %47 = llvm.shl %46, %16 : i64 %48 = llvm.or %45, %47 : i64 %49 = llvm.zext %36 : i32 to i64 %50 = llvm.zext %40 : i32 to i64 %51 = llvm.shl %50, %16 : i64 %52 = llvm.or %49, %51 : i64 %53 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %54 = llvm.extractvalue %53[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %55 = llvm.getelementptr %54[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %56 = llvm.load %55 : !llvm.ptr -> !llvm.ptr %57 = llvm.ptrtoint %56 : !llvm.ptr to i64 %58 = llvm.and %57, %4 : i64 %59 = llvm.icmp "eq" %58, %15 : i64 "llvm.intr.assume"(%59) : (i1) -> () %60 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %61 = llvm.extractvalue %60[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %62 = llvm.getelementptr %61[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %63 = llvm.load %62 : !llvm.ptr -> !llvm.ptr %64 = llvm.ptrtoint %63 : !llvm.ptr to i64 %65 = llvm.and %64, %4 : i64 %66 = llvm.icmp "eq" %65, %15 : i64 "llvm.intr.assume"(%66) : (i1) -> () %67 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %68 = llvm.extractvalue %67[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %69 = llvm.load %68 : !llvm.ptr -> !llvm.ptr %70 = llvm.mul %52, %2 : i64 %71 = llvm.mul %70, %48 : i64 %72 = llvm.mul %71, %44 : i64 %73 = llvm.ptrtoint %69 : !llvm.ptr to i64 %74 = llvm.and %73, %4 : i64 %75 = llvm.icmp "eq" %74, %15 : i64 "llvm.intr.assume"(%75) : (i1) -> () %76 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %77 = 
llvm.extractvalue %76[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %78 = llvm.zext %77 : i32 to i64 %79 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %80 = llvm.extractvalue %79[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %81 = llvm.zext %80 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%82: i64): // 2 preds: ^bb0, ^bb8 %83 = llvm.icmp "slt" %82, %14 : i64 llvm.cond_br %83, ^bb2(%15 : i64), ^bb9 ^bb2(%84: i64): // 2 preds: ^bb1, ^bb7 %85 = llvm.icmp "slt" %84, %12 : i64 llvm.cond_br %85, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%86: i64, %87: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %88 = llvm.icmp "slt" %86, %10 : i64 llvm.cond_br %88, ^bb4(%15, %87 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%89: i64, %90: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %91 = llvm.icmp "slt" %89, %10 : i64 llvm.cond_br %91, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %92 = llvm.add %78, %86 : i64 %93 = llvm.add %84, %89 : i64 %94 = llvm.mul %82, %72 : i64 %95 = llvm.mul %71, %15 : i64 %96 = llvm.add %94, %95 : i64 %97 = llvm.mul %92, %70 : i64 %98 = llvm.add %96, %97 : i64 %99 = llvm.add %98, %93 : i64 %100 = llvm.getelementptr %69[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %78, %86 : i64 %103 = llvm.add %102, %13 : i64 %104 = llvm.mul %82, %72 : i64 %105 = llvm.mul %71, %15 : i64 %106 = llvm.add %104, %105 : i64 %107 = llvm.mul %103, %70 : i64 %108 = llvm.add %106, %107 : i64 %109 = llvm.add %108, %93 : i64 %110 = llvm.getelementptr %69[%109] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %111 = llvm.load %110 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %112 = llvm.add %78, %86 : i64 %113 = llvm.add %112, %14 : i64 %114 = llvm.mul %82, %72 : i64 %115 = llvm.mul %71, %15 : i64 %116 = llvm.add %114, %115 : i64 %117 = llvm.mul %113, %70 : i64 %118 = llvm.add %116, %117 : i64 %119 = llvm.add %118, %93 : i64 %120 = llvm.getelementptr %69[%119] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %121 = llvm.load %120 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %122 = llvm.add %78, %86 : i64 %123 = llvm.add %122, %11 : i64 %124 = llvm.mul %82, %72 : i64 %125 = llvm.mul %71, %15 : i64 %126 = llvm.add %124, %125 : i64 %127 = llvm.mul %123, %70 : i64 %128 = llvm.add %126, %127 : i64 %129 = llvm.add %128, %93 : i64 %130 = llvm.getelementptr %69[%129] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %131 = llvm.load %130 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %132 = llvm.add %78, %86 : i64 %133 = llvm.add %132, %8 : i64 %134 = llvm.mul %82, %72 : i64 %135 = llvm.mul %71, %15 : i64 %136 = llvm.add %134, %135 : i64 %137 = llvm.mul %133, %70 : i64 %138 = llvm.add %136, %137 : i64 %139 = llvm.add %138, %93 : i64 %140 = llvm.getelementptr %69[%139] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %141 = llvm.load %140 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %142 = llvm.add %78, %86 : i64 %143 = llvm.add %142, %10 : i64 %144 = llvm.mul %82, %72 : i64 %145 = llvm.mul %71, %15 : i64 %146 = llvm.add %144, %145 : i64 %147 = llvm.mul %143, %70 : i64 %148 = llvm.add %146, %147 : i64 %149 = llvm.add %148, %93 : i64 %150 = llvm.getelementptr %69[%149] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %151 = llvm.load %150 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %152 = llvm.extractvalue %90[0, 
0] : !llvm.array<1 x array<1 x vector<3xf32>>> %153 = llvm.mul %81, %6 : i64 %154 = llvm.mul %15, %5 : i64 %155 = llvm.add %153, %154 : i64 %156 = llvm.mul %86, %10 : i64 %157 = llvm.add %155, %156 : i64 %158 = llvm.add %157, %89 : i64 %159 = llvm.getelementptr %56[%158] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %160 = llvm.load %159 : !llvm.ptr -> f32 %161 = llvm.insertelement %160, %1[%0 : i32] : vector<3xf32> %162 = llvm.shufflevector %161, %1 [0, 0, 0] : vector<3xf32> %163 = llvm.intr.fmuladd(%101, %162, %152) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %164 = llvm.mul %81, %6 : i64 %165 = llvm.mul %13, %5 : i64 %166 = llvm.add %164, %165 : i64 %167 = llvm.mul %86, %10 : i64 %168 = llvm.add %166, %167 : i64 %169 = llvm.add %168, %89 : i64 %170 = llvm.getelementptr %56[%169] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %171 = llvm.load %170 : !llvm.ptr -> f32 %172 = llvm.insertelement %171, %1[%0 : i32] : vector<3xf32> %173 = llvm.shufflevector %172, %1 [0, 0, 0] : vector<3xf32> %174 = llvm.intr.fmuladd(%111, %173, %163) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %175 = llvm.mul %81, %6 : i64 %176 = llvm.mul %14, %5 : i64 %177 = llvm.add %175, %176 : i64 %178 = llvm.mul %86, %10 : i64 %179 = llvm.add %177, %178 : i64 %180 = llvm.add %179, %89 : i64 %181 = llvm.getelementptr %56[%180] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %182 = llvm.load %181 : !llvm.ptr -> f32 %183 = llvm.insertelement %182, %1[%0 : i32] : vector<3xf32> %184 = llvm.shufflevector %183, %1 [0, 0, 0] : vector<3xf32> %185 = llvm.intr.fmuladd(%121, %184, %174) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %186 = llvm.mul %81, %6 : i64 %187 = llvm.mul %11, %5 : i64 %188 = llvm.add %186, %187 : i64 %189 = llvm.mul %86, %10 : i64 %190 = llvm.add %188, %189 : i64 %191 = llvm.add %190, %89 : i64 %192 = llvm.getelementptr %56[%191] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %193 = llvm.load %192 : !llvm.ptr -> f32 %194 = llvm.insertelement %193, %1[%0 : i32] : vector<3xf32> %195 = llvm.shufflevector %194, %1 [0, 0, 0] : vector<3xf32> %196 = llvm.intr.fmuladd(%131, %195, %185) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %197 = llvm.mul %81, %6 : i64 %198 = llvm.mul %8, %5 : i64 %199 = llvm.add %197, %198 : i64 %200 = llvm.mul %86, %10 : i64 %201 = llvm.add %199, %200 : i64 %202 = llvm.add %201, %89 : i64 %203 = llvm.getelementptr %56[%202] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %204 = llvm.load %203 : !llvm.ptr -> f32 %205 = llvm.insertelement %204, %1[%0 : i32] : vector<3xf32> %206 = llvm.shufflevector %205, %1 [0, 0, 0] : vector<3xf32> %207 = llvm.intr.fmuladd(%141, %206, %196) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %208 = llvm.mul %81, %6 : i64 %209 = llvm.mul %10, %5 : i64 %210 = llvm.add %208, %209 : i64 %211 = llvm.mul %86, %10 : i64 %212 = llvm.add %210, %211 : i64 %213 = llvm.add %212, %89 : i64 %214 = llvm.getelementptr %56[%213] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %215 = llvm.load %214 : !llvm.ptr -> f32 %216 = llvm.insertelement %215, %1[%0 : i32] : vector<3xf32> %217 = llvm.shufflevector %216, %1 [0, 0, 0] : vector<3xf32> %218 = llvm.intr.fmuladd(%151, %217, %207) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %219 = llvm.insertvalue %218, %7[0] : !llvm.array<1 x vector<3xf32>> %220 = llvm.insertvalue %219, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %221 = llvm.add %89, %13 : i64 llvm.br ^bb4(%221, %220 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %222 = llvm.add %86, %13 : i64 
llvm.br ^bb3(%222, %90 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %223 = llvm.extractvalue %87[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %224 = llvm.mul %82, %3 : i64 %225 = llvm.mul %81, %4 : i64 %226 = llvm.add %224, %225 : i64 %227 = llvm.mul %78, %12 : i64 %228 = llvm.add %226, %227 : i64 %229 = llvm.add %228, %84 : i64 %230 = llvm.getelementptr %63[%229] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %223, %230 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %231 = llvm.add %84, %11 : i64 llvm.br ^bb2(%231 : i64) ^bb8: // pred: ^bb2 %232 = llvm.add %82, %13 : i64 llvm.br ^bb1(%232 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } }
// -----// IR Dump After LLVMCPUSynchronizeSymbolVisibilityPass (iree-llvmcpu-synchronize-symbol-visibility) //----- //
// (module identical to the preceding dump)
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (module identical to the preceding dump)
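For reference, the op being lowered in all of these dumps is a plain 2-D convolution; the dispatch name @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 encodes the shapes: NCHW output 2x4x7x9, reduced over 6 input channels and a 5x5 window, so the input window feeding it is 2x6x11x13 (7 = 11 - 5 + 1 and 9 = 13 - 5 + 1), assuming the unit strides and dilations that the indexing here implies. A minimal NumPy sketch of that computation (function and variable names are illustrative, not from IREE):

import numpy as np

def conv_2d_nchw_fchw(x, w):
    # x: NCHW input, w: FCHW filter; unit strides and dilations.
    n, c, h, wid = x.shape
    f, _, kh, kw = w.shape
    oh, ow = h - kh + 1, wid - kw + 1
    out = np.zeros((n, f, oh, ow), dtype=x.dtype)
    for b in range(n):
        for of in range(f):
            for y in range(oh):
                for xo in range(ow):
                    # Dot product of one input window with one filter.
                    out[b, of, y, xo] = np.sum(x[b, :, y:y+kh, xo:xo+kw] * w[of])
    return out

x = np.random.rand(2, 6, 11, 13).astype(np.float32)
w = np.random.rand(4, 6, 5, 5).astype(np.float32)
assert conv_2d_nchw_fchw(x, w).shape == (2, 4, 7, 9)

In the LLVM-dialect dumps this shows up as: the ^bb2 loop steps the output width in chunks of 3 (the vector<3xf32> accumulator; 9 = 3 x 3), ^bb5 loads 3-wide input vectors, broadcasts one filter scalar at a time via insertelement plus shufflevector, and accumulates with llvm.intr.fmuladd, and ^bb7 stores the finished 3-wide result to the output buffer.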
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (module identical to the preceding dump)
// -----// IR Dump Before CSE (cse) //----- //
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr
{llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %22 = llvm.extractvalue %21[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %23 = llvm.getelementptr %22[3] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %26 = llvm.extractvalue %25[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %27 = llvm.getelementptr %26[4] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %30 = llvm.extractvalue %29[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %31 = llvm.getelementptr %30[5] : (!llvm.ptr) -> !llvm.ptr, i32 %32 = llvm.load %31 : !llvm.ptr -> i32 %33 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %34 = llvm.extractvalue %33[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %35 = llvm.getelementptr %34[6] : (!llvm.ptr) -> !llvm.ptr, i32 %36 = llvm.load %35 : !llvm.ptr -> i32 %37 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %38 = llvm.extractvalue %37[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %39 = llvm.getelementptr %38[7] : (!llvm.ptr) -> !llvm.ptr, i32 %40 = llvm.load %39 : !llvm.ptr -> i32 %41 = llvm.zext %20 : i32 to i64 %42 = llvm.zext %24 : i32 to i64 %43 = llvm.shl %42, %16 : i64 %44 = llvm.or %41, %43 : i64 %45 = llvm.zext %28 : i32 to i64 %46 = llvm.zext %32 : i32 to 
i64 %47 = llvm.shl %46, %16 : i64 %48 = llvm.or %45, %47 : i64 %49 = llvm.zext %36 : i32 to i64 %50 = llvm.zext %40 : i32 to i64 %51 = llvm.shl %50, %16 : i64 %52 = llvm.or %49, %51 : i64 %53 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %54 = llvm.extractvalue %53[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %55 = llvm.getelementptr %54[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %56 = llvm.load %55 : !llvm.ptr -> !llvm.ptr %57 = llvm.ptrtoint %56 : !llvm.ptr to i64 %58 = llvm.and %57, %4 : i64 %59 = llvm.icmp "eq" %58, %15 : i64 "llvm.intr.assume"(%59) : (i1) -> () %60 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %61 = llvm.extractvalue %60[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %62 = llvm.getelementptr %61[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %63 = llvm.load %62 : !llvm.ptr -> !llvm.ptr %64 = llvm.ptrtoint %63 : !llvm.ptr to i64 %65 = llvm.and %64, %4 : i64 %66 = llvm.icmp "eq" %65, %15 : i64 "llvm.intr.assume"(%66) : (i1) -> () %67 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %68 = llvm.extractvalue %67[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %69 = llvm.load %68 : !llvm.ptr -> !llvm.ptr %70 = llvm.mul %52, %2 : i64 %71 = llvm.mul %70, %48 : i64 %72 = llvm.mul %71, %44 : i64 %73 = llvm.ptrtoint %69 : !llvm.ptr to i64 %74 = llvm.and %73, %4 : i64 %75 = llvm.icmp "eq" %74, %15 : i64 "llvm.intr.assume"(%75) : (i1) -> () %76 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %77 = llvm.extractvalue %76[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %78 = llvm.zext %77 : i32 to i64 %79 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %80 = llvm.extractvalue %79[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %81 = llvm.zext %80 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%82: i64): // 2 preds: ^bb0, ^bb8 %83 = llvm.icmp "slt" %82, %14 : i64 llvm.cond_br %83, ^bb2(%15 : i64), ^bb9 ^bb2(%84: i64): // 2 preds: ^bb1, ^bb7 %85 = llvm.icmp "slt" %84, %12 : i64 llvm.cond_br %85, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%86: i64, %87: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %88 = llvm.icmp "slt" %86, %10 : i64 llvm.cond_br %88, ^bb4(%15, %87 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%89: i64, %90: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %91 = llvm.icmp "slt" %89, %10 : i64 llvm.cond_br %91, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %92 = llvm.add %78, %86 : i64 %93 = llvm.add %84, %89 : i64 %94 = llvm.mul %82, %72 : i64 %95 = llvm.mul %71, %15 : i64 %96 = llvm.add %94, %95 : i64 %97 = llvm.mul %92, %70 : i64 %98 = llvm.add %96, %97 : i64 %99 = llvm.add %98, %93 : i64 %100 = llvm.getelementptr %69[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> 
%102 = llvm.add %78, %86 : i64 %103 = llvm.add %102, %13 : i64 %104 = llvm.mul %82, %72 : i64 %105 = llvm.mul %71, %15 : i64 %106 = llvm.add %104, %105 : i64 %107 = llvm.mul %103, %70 : i64 %108 = llvm.add %106, %107 : i64 %109 = llvm.add %108, %93 : i64 %110 = llvm.getelementptr %69[%109] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %111 = llvm.load %110 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %112 = llvm.add %78, %86 : i64 %113 = llvm.add %112, %14 : i64 %114 = llvm.mul %82, %72 : i64 %115 = llvm.mul %71, %15 : i64 %116 = llvm.add %114, %115 : i64 %117 = llvm.mul %113, %70 : i64 %118 = llvm.add %116, %117 : i64 %119 = llvm.add %118, %93 : i64 %120 = llvm.getelementptr %69[%119] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %121 = llvm.load %120 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %122 = llvm.add %78, %86 : i64 %123 = llvm.add %122, %11 : i64 %124 = llvm.mul %82, %72 : i64 %125 = llvm.mul %71, %15 : i64 %126 = llvm.add %124, %125 : i64 %127 = llvm.mul %123, %70 : i64 %128 = llvm.add %126, %127 : i64 %129 = llvm.add %128, %93 : i64 %130 = llvm.getelementptr %69[%129] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %131 = llvm.load %130 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %132 = llvm.add %78, %86 : i64 %133 = llvm.add %132, %8 : i64 %134 = llvm.mul %82, %72 : i64 %135 = llvm.mul %71, %15 : i64 %136 = llvm.add %134, %135 : i64 %137 = llvm.mul %133, %70 : i64 %138 = llvm.add %136, %137 : i64 %139 = llvm.add %138, %93 : i64 %140 = llvm.getelementptr %69[%139] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %141 = llvm.load %140 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %142 = llvm.add %78, %86 : i64 %143 = llvm.add %142, %10 : i64 %144 = llvm.mul %82, %72 : i64 %145 = llvm.mul %71, %15 : i64 %146 = llvm.add %144, %145 : i64 %147 = llvm.mul %143, %70 : i64 %148 = llvm.add %146, %147 : i64 %149 = llvm.add %148, %93 : i64 %150 = llvm.getelementptr %69[%149] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %151 = llvm.load %150 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %152 = llvm.extractvalue %90[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %153 = llvm.mul %81, %6 : i64 %154 = llvm.mul %15, %5 : i64 %155 = llvm.add %153, %154 : i64 %156 = llvm.mul %86, %10 : i64 %157 = llvm.add %155, %156 : i64 %158 = llvm.add %157, %89 : i64 %159 = llvm.getelementptr %56[%158] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %160 = llvm.load %159 : !llvm.ptr -> f32 %161 = llvm.insertelement %160, %1[%0 : i32] : vector<3xf32> %162 = llvm.shufflevector %161, %1 [0, 0, 0] : vector<3xf32> %163 = llvm.intr.fmuladd(%101, %162, %152) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %164 = llvm.mul %81, %6 : i64 %165 = llvm.mul %13, %5 : i64 %166 = llvm.add %164, %165 : i64 %167 = llvm.mul %86, %10 : i64 %168 = llvm.add %166, %167 : i64 %169 = llvm.add %168, %89 : i64 %170 = llvm.getelementptr %56[%169] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %171 = llvm.load %170 : !llvm.ptr -> f32 %172 = llvm.insertelement %171, %1[%0 : i32] : vector<3xf32> %173 = llvm.shufflevector %172, %1 [0, 0, 0] : vector<3xf32> %174 = llvm.intr.fmuladd(%111, %173, %163) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %175 = llvm.mul %81, %6 : i64 %176 = llvm.mul %14, %5 : i64 %177 = llvm.add %175, %176 : i64 %178 = llvm.mul %86, %10 : i64 %179 = llvm.add %177, %178 : i64 %180 = llvm.add %179, %89 : i64 %181 = llvm.getelementptr %56[%180] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %182 = llvm.load %181 : !llvm.ptr -> f32 %183 = llvm.insertelement %182, %1[%0 : i32] : vector<3xf32> %184 = llvm.shufflevector %183, %1 [0, 0, 
0] : vector<3xf32> %185 = llvm.intr.fmuladd(%121, %184, %174) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %186 = llvm.mul %81, %6 : i64 %187 = llvm.mul %11, %5 : i64 %188 = llvm.add %186, %187 : i64 %189 = llvm.mul %86, %10 : i64 %190 = llvm.add %188, %189 : i64 %191 = llvm.add %190, %89 : i64 %192 = llvm.getelementptr %56[%191] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %193 = llvm.load %192 : !llvm.ptr -> f32 %194 = llvm.insertelement %193, %1[%0 : i32] : vector<3xf32> %195 = llvm.shufflevector %194, %1 [0, 0, 0] : vector<3xf32> %196 = llvm.intr.fmuladd(%131, %195, %185) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %197 = llvm.mul %81, %6 : i64 %198 = llvm.mul %8, %5 : i64 %199 = llvm.add %197, %198 : i64 %200 = llvm.mul %86, %10 : i64 %201 = llvm.add %199, %200 : i64 %202 = llvm.add %201, %89 : i64 %203 = llvm.getelementptr %56[%202] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %204 = llvm.load %203 : !llvm.ptr -> f32 %205 = llvm.insertelement %204, %1[%0 : i32] : vector<3xf32> %206 = llvm.shufflevector %205, %1 [0, 0, 0] : vector<3xf32> %207 = llvm.intr.fmuladd(%141, %206, %196) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %208 = llvm.mul %81, %6 : i64 %209 = llvm.mul %10, %5 : i64 %210 = llvm.add %208, %209 : i64 %211 = llvm.mul %86, %10 : i64 %212 = llvm.add %210, %211 : i64 %213 = llvm.add %212, %89 : i64 %214 = llvm.getelementptr %56[%213] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %215 = llvm.load %214 : !llvm.ptr -> f32 %216 = llvm.insertelement %215, %1[%0 : i32] : vector<3xf32> %217 = llvm.shufflevector %216, %1 [0, 0, 0] : vector<3xf32> %218 = llvm.intr.fmuladd(%151, %217, %207) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %219 = llvm.insertvalue %218, %7[0] : !llvm.array<1 x vector<3xf32>> %220 = llvm.insertvalue %219, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %221 = llvm.add %89, %13 : i64 llvm.br ^bb4(%221, %220 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %222 = llvm.add %86, %13 : i64 llvm.br ^bb3(%222, %90 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %223 = llvm.extractvalue %87[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %224 = llvm.mul %82, %3 : i64 %225 = llvm.mul %81, %4 : i64 %226 = llvm.add %224, %225 : i64 %227 = llvm.mul %78, %12 : i64 %228 = llvm.add %226, %227 : i64 %229 = llvm.add %228, %84 : i64 %230 = llvm.getelementptr %63[%229] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %223, %230 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %231 = llvm.add %84, %11 : i64 llvm.br ^bb2(%231 : i64) ^bb8: // pred: ^bb2 %232 = llvm.add %82, %13 : i64 llvm.br ^bb1(%232 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } // -----// IR Dump After CSE (cse) //----- // module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = 
llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : 
!llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 
%138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } // -----// IR Dump Before AddFastMathFlagsPass (iree-codegen-add-fast-math-flags) //----- // llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = 
llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : 
i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> 
%132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } // -----// IR Dump After AddFastMathFlagsPass (iree-codegen-add-fast-math-flags) //----- // llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, 
%arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, 
i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 
%129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 
llvm.return %0 : i32 } // -----// IR Dump After TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) //----- // hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = 
llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = 
llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 
= llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } // -----// IR Dump After TranslateExecutablesPass (iree-hal-translate-executables) //----- // hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = 
llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, 
%14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = 
llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } // -----// IR Dump Before ConvertToHALPass (iree-hal-conversion) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = 
#device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and 
%46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> 
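// NOTE: ^bb5 is the fully unrolled innermost body of the conv kernel: six
// multiply-accumulate steps, one per input channel of the 4x6x5x5 filter.
// Each step loads a 3-wide f32 vector from the input buffer (%58, binding 0),
// broadcasts one filter scalar loaded from %45 (binding 1; FCHW indexing
// f*150 + c*25 + kh*5 + kw), and folds the product into the vector<3xf32>
// accumulator %78 via llvm.intr.fmuladd. The enclosing loops appear to be
// batch (^bb1, bound 2), output width in steps of 3 (^bb2, bound 9), and the
// 5x5 filter window (^bb3/^bb4); ^bb7 stores the finished 3-wide output tile
// through %52 (binding 2).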
%108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : 
!llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor{%0, %1, %2, %3} in !stream.resource{%7} hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %9 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<4x6x5x5xf32> in !stream.resource{%c2400} %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c2016} => !stream.timepoint %10 = arith.index_castui %0 : index to i64 %11 = arith.trunci %10 : i64 to i32 %12 = arith.shrui %10, %c32_i64 : i64 %13 = arith.trunci %12 : i64 to i32 %14 = arith.index_castui %1 : index to i64 %15 = arith.trunci %14 : i64 to i32 %16 = arith.shrui %14, %c32_i64 : i64 %17 = arith.trunci %16 : i64 to i32 %18 = arith.index_castui %2 : index to i64 %19 = arith.trunci %18 : i64 to i32 %20 = arith.shrui %18, %c32_i64 : i64 %21 = arith.trunci %20 : i64 to i32 %22 = arith.index_castui %3 : index to i64 %23 = arith.trunci %22 : i64 to i32 %24 = arith.shrui %22, %c32_i64 : i64 %25 = arith.trunci %24 : i64 to i32 %26 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%8 as %arg2: !stream.resource{%7}, %9 as %arg3: !stream.resource{%c2400}, %result as %arg4: !stream.resource{%c2016}) { 
stream.cmd.dispatch @main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32[%c2, %c6, %c11, %c13](%11, %13, %15, %17, %19, %21, %23, %25 : i32, i32, i32, i32, i32, i32, i32, i32) { ro %arg2[%c0 for %7] : !stream.resource{%7}, ro %arg3[%c0 for %c2400] : !stream.resource{%c2400}, wo %arg4[%c0 for %c2016] : !stream.resource{%c2016} } } => !stream.timepoint %27 = stream.timepoint.await %26 => %result : !stream.resource{%c2016} %28 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %27 : tensor<2x4x7x9xf32> in !stream.resource{%c2016} -> !hal.buffer_view util.return %28 : !hal.buffer_view } } // -----// IR Dump After ConvertToHALPass (iree-hal-conversion) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load 
%19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : 
i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load 
%154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = 
hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer %__device_0_1 = util.global.load immutable @__device_0 : !hal.device %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %__device_0_3 = util.global.load immutable @__device_0 : !hal.device %c-1_i64 = arith.constant -1 : i64 %8 = util.null : !hal.fence %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence %c0_i64 = arith.constant 0 : i64 %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %0 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %1 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %2 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %3 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %__device_0_4 = util.global.load immutable @__device_0 : !hal.device %c-1_i64_5 = arith.constant -1 : i64 %cmd = hal.command_buffer.create device(%__device_0_4 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64_5) : !hal.command_buffer %25 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device %pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 %c0_6 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2_7 = arith.constant 2 : index %c0_8 = arith.constant 0 : index hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_8] bindings([ %c0_6 = (%buffer : !hal.buffer)[%c0, %7], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2_7 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %c7 = arith.constant 7 : index %c4_9 = arith.constant 4 : index %c1_10 = arith.constant 1 : index %exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : 
!hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4_9, %c1_10]) flags("None")
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
    %fence_11 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence
    hal.device.queue.execute<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_11) commands([%cmd])
    %c-1_i32 = arith.constant -1 : i32
    %status = hal.fence.await until([%fence_11]) timeout_millis(%c-1_i32) : i32
    util.status.check_ok %status, "failed to wait on timepoint"
    %dense_row_major_12 = hal.encoding_type<dense_row_major> : i32
    %element_type_f32_13 = hal.element_type<f32> : i32
    %c2_14 = arith.constant 2 : index
    %c4_15 = arith.constant 4 : index
    %c7_16 = arith.constant 7 : index
    %c9 = arith.constant 9 : index
    %c0_17 = arith.constant 0 : index
    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0_17, %c2016] shape([%c2_14, %c4_15, %c7_16, %c9]) type(%element_type_f32_13) encoding(%dense_row_major_12) : !hal.buffer_view
    util.return %view : !hal.buffer_view
  }
}
// -----// IR Dump Before InlineMemoizeRegionsPass (iree-hal-inline-memoize-regions) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c32_i64 = arith.constant 32 : i64
  %c2016 = arith.constant 2016 : index
  %c2400 = arith.constant 2400 : index
  %c0 = arith.constant 0 : index
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c13 = arith.constant 13 : index
  %c11 = arith.constant 11 : index
  %c2 = arith.constant 2 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major)
  %4 = arith.muli %0, %c4 : index
  %5 = arith.muli %4, %1 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
  %__device_0 = util.global.load immutable @__device_0 : !hal.device
  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
  %__device_0_1 = util.global.load immutable @__device_0 : !hal.device
  %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator
  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c2400) type(DeviceVisible)
    usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
  %__device_0_3 = util.global.load immutable @__device_0 : !hal.device
  %c-1_i64 = arith.constant -1 : i64
  %8 = util.null : !hal.fence
  %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence
  %c0_i64 = arith.constant 0 : i64
  %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016}
  %9 = arith.index_castui %0 : index to i64
  %10 = arith.trunci %9 : i64 to i32
  %11 = arith.shrui %9, %c32_i64 : i64
  %12 = arith.trunci %11 : i64 to i32
  %13 = arith.index_castui %1 : index to i64
  %14 = arith.trunci %13 : i64 to i32
  %15 = arith.shrui %13, %c32_i64 : i64
  %16 = arith.trunci %15 : i64 to i32
  %17 = arith.index_castui %2 : index to i64
  %18 = arith.trunci %17 : i64 to i32
  %19 = arith.shrui %17, %c32_i64 : i64
  %20 = arith.trunci %19 : i64 to i32
  %21 = arith.index_castui %3 : index to i64
  %22 = arith.trunci %21 : i64 to i32
  %23 = arith.shrui %21, %c32_i64 : i64
  %24 = arith.trunci %23 : i64 to i32
  %__device_0_4 = util.global.load immutable @__device_0 : !hal.device
  %c-1_i64_5 = arith.constant -1 : i64
  %cmd = hal.command_buffer.create device(%__device_0_4 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64_5) : !hal.command_buffer
  %25 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
  %pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) : !hal.pipeline_layout
  hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32
  %c0_6 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2_7 = arith.constant 2 : index
  %c0_8 = arith.constant 0 : index
  hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_8] bindings([
    %c0_6 = (%buffer : !hal.buffer)[%c0, %7],
    %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400],
    %c2_7 = (%transient_buffer : !hal.buffer)[%c0, %c2016]
  ])
  %c7 = arith.constant 7 : index
  %c4_9 = arith.constant 4 : index
  %c1_10 = arith.constant 1 : index
  %exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable
  %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index
  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4_9, %c1_10]) flags("None")
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
  %fence_11 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence
  hal.device.queue.execute<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_11) commands([%cmd])
  %c-1_i32 = arith.constant -1 : i32
  %status = hal.fence.await until([%fence_11]) timeout_millis(%c-1_i32) : i32
  util.status.check_ok %status, "failed to wait on timepoint"
  %dense_row_major_12 = hal.encoding_type<dense_row_major> : 
i32 %element_type_f32_13 = hal.element_type : i32 %c2_14 = arith.constant 2 : index %c4_15 = arith.constant 4 : index %c7_16 = arith.constant 7 : index %c9 = arith.constant 9 : index %c0_17 = arith.constant 0 : index %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0_17, %c2016] shape([%c2_14, %c4_15, %c7_16, %c9]) type(%element_type_f32_13) encoding(%dense_row_major_12) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After InlineMemoizeRegionsPass (iree-hal-inline-memoize-regions) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer %__device_0_1 = util.global.load immutable @__device_0 : !hal.device %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %__device_0_3 = util.global.load immutable @__device_0 : !hal.device %c-1_i64 = arith.constant -1 : i64 %8 = util.null : !hal.fence %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence %c0_i64 = arith.constant 0 : i64 %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %0 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %1 : 
index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %2 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %3 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %__device_0_4 = util.global.load immutable @__device_0 : !hal.device %c-1_i64_5 = arith.constant -1 : i64 %cmd = hal.command_buffer.create device(%__device_0_4 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64_5) : !hal.command_buffer %25 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device %pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 %c0_6 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2_7 = arith.constant 2 : index %c0_8 = arith.constant 0 : index hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_8] bindings([ %c0_6 = (%buffer : !hal.buffer)[%c0, %7], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2_7 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %c7 = arith.constant 7 : index %c4_9 = arith.constant 4 : index %c1_10 = arith.constant 1 : index %exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4_9, %c1_10]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_11 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_11) commands([%cmd]) %c-1_i32 = arith.constant -1 : i32 %status = hal.fence.await until([%fence_11]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %dense_row_major_12 = hal.encoding_type : i32 %element_type_f32_13 = hal.element_type : i32 %c2_14 = arith.constant 2 : index %c4_15 = arith.constant 4 : index %c7_16 = arith.constant 7 : index %c9 = arith.constant 9 : index %c0_17 = arith.constant 0 : index %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0_17, %c2016] shape([%c2_14, %c4_15, %c7_16, %c9]) type(%element_type_f32_13) encoding(%dense_row_major_12) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before FixupLegacySyncPass (iree-hal-fixup-legacy-sync) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = 
"x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> 
vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : 
!llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer %__device_0_1 = util.global.load immutable @__device_0 : !hal.device %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %__device_0_3 = util.global.load 
immutable @__device_0 : !hal.device %c-1_i64 = arith.constant -1 : i64 %8 = util.null : !hal.fence %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence %c0_i64 = arith.constant 0 : i64 %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %0 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %1 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %2 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %3 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %__device_0_4 = util.global.load immutable @__device_0 : !hal.device %c-1_i64_5 = arith.constant -1 : i64 %cmd = hal.command_buffer.create device(%__device_0_4 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64_5) : !hal.command_buffer %25 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device %pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 %c0_6 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2_7 = arith.constant 2 : index %c0_8 = arith.constant 0 : index hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_8] bindings([ %c0_6 = (%buffer : !hal.buffer)[%c0, %7], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2_7 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %c7 = arith.constant 7 : index %c4_9 = arith.constant 4 : index %c1_10 = arith.constant 1 : index %exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4_9, %c1_10]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_11 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_11) commands([%cmd]) %c-1_i32 = arith.constant -1 : i32 %status = hal.fence.await until([%fence_11]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %dense_row_major_12 = hal.encoding_type<dense_row_major> : i32 %element_type_f32_13 = hal.element_type<f32> : i32 %c2_14 = arith.constant 2 : index %c4_15 = arith.constant 4 : index %c7_16 = arith.constant 7 : index %c9 = arith.constant 9 : index %c0_17 = arith.constant 0 : index 
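// Note (annotation, not part of the original dump): the queue-allocated
// transient buffer wrapped below is exactly the size of the static result:
// 2*4*7*9 = 504 f32 elements * 4 bytes = 2016 bytes (%c2016).
// hal.buffer_view.create attaches only shape/element-type/encoding metadata
// so the buffer can be returned through the !hal.buffer_view ABI; the eight
// push constants above carry each dynamic input dim as a lo/hi i32 pair
// (trunci for bits 0..31, shrui 32 + trunci for bits 32..63), which the
// dispatch reassembles with llvm.zext/shl/or.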
%view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0_17, %c2016] shape([%c2_14, %c4_15, %c7_16, %c9]) type(%element_type_f32_13) encoding(%dense_row_major_12) : !hal.buffer_view util.return %view : !hal.buffer_view } }
// -----// IR Dump After FixupLegacySyncPass (iree-hal-fixup-legacy-sync) //----- //
// (module unchanged by this pass; the dump repeats the module above verbatim)
// -----// IR Dump Before PruneExecutablesPass (iree-hal-prune-executables) //----- //
// (identical verbatim repeat of the module above)
// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = 
llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = 
llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = 
llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index
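// NOTE: host-side ABI wrapper. The ops below query the four dynamic
// dimensions of %input0 from the incoming !hal.buffer_view, assert the f32
// element type and dense-row-major encoding, and compute the required byte
// length (dim0 * 4 bytes per f32 element * dim1 * dim2 * dim3) that is then
// checked against the bound buffer.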
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer %__device_0_1 = util.global.load immutable @__device_0 : !hal.device %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %__device_0_3 = util.global.load immutable @__device_0 : !hal.device %c-1_i64 = arith.constant -1 : i64 %8 = util.null : !hal.fence %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence %c0_i64 = arith.constant 0 : i64 %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %0 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %1 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %2 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %3 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %__device_0_4 = util.global.load immutable @__device_0 : !hal.device %c-1_i64_5 = arith.constant -1 : i64 %cmd = hal.command_buffer.create device(%__device_0_4 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64_5) : !hal.command_buffer %25 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device %pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 %c0_6 = arith.constant 0 : index
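// NOTE: the descriptor set pushed next has three storage-buffer bindings:
// binding 0 is the dynamically sized input (%7 bytes), binding 1 the
// 4x6x5x5xf32 filter (600 elements * 4 bytes = 2400), and binding 2 the
// transient 2x4x7x9xf32 output (504 elements * 4 bytes = 2016).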
%c1 = arith.constant 1 : index %c2_7 = arith.constant 2 : index %c0_8 = arith.constant 0 : index hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_8] bindings([ %c0_6 = (%buffer : !hal.buffer)[%c0, %7], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2_7 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %c7 = arith.constant 7 : index %c4_9 = arith.constant 4 : index %c1_10 = arith.constant 1 : index %exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4_9, %c1_10]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_11 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_11) commands([%cmd]) %c-1_i32 = arith.constant -1 : i32 %status = hal.fence.await until([%fence_11]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %dense_row_major_12 = hal.encoding_type<dense_row_major> : i32 %element_type_f32_13 = hal.element_type<f32> : i32 %c2_14 = arith.constant 2 : index %c4_15 = arith.constant 4 : index %c7_16 = arith.constant 7 : index %c9 = arith.constant 9 : index %c0_17 = arith.constant 0 : index %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0_17, %c2016] shape([%c2_14, %c4_15, %c7_16, %c9]) type(%element_type_f32_13) encoding(%dense_row_major_12) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2:
!llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = 
llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = 
llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: 
!hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %c13 = arith.constant 13 : index %c11 = arith.constant 11 : index %c2 = arith.constant 2 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer %__device_0_1 = util.global.load immutable @__device_0 : !hal.device %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %__device_0_3 = util.global.load immutable @__device_0 : !hal.device %c-1_i64 = arith.constant -1 : i64 %8 = util.null : !hal.fence %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence %c0_i64 = arith.constant 0 : i64 %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %0 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %1 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %2 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %3 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32
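// NOTE: each dynamic dimension above is widened to i64 and split into
// low/high i32 halves (trunci plus shrui by 32) so it fits the eight i32
// push constants; the dispatch function reassembles the halves with
// zext + shl + or into the i64 sizes used for indexing.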
%__device_0_4 = util.global.load immutable @__device_0 : !hal.device %c-1_i64_5 = arith.constant -1 : i64 %cmd = hal.command_buffer.create device(%__device_0_4 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64_5) : !hal.command_buffer %25 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device %pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 %c0_6 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2_7 = arith.constant 2 : index %c0_8 = arith.constant 0 : index hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_8] bindings([ %c0_6 = (%buffer : !hal.buffer)[%c0, %7], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2_7 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %c7 = arith.constant 7 : index %c4_9 = arith.constant 4 : index %c1_10 = arith.constant 1 : index %exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4_9, %c1_10]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_11 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_11) commands([%cmd]) %c-1_i32 = arith.constant -1 : i32 %status = hal.fence.await until([%fence_11]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %dense_row_major_12 = hal.encoding_type<dense_row_major> : i32 %element_type_f32_13 = hal.element_type<f32> : i32 %c2_14 = arith.constant 2 : index %c4_15 = arith.constant 4 : index %c7_16 = arith.constant 7 : index %c9 = arith.constant 9 : index %c0_17 = arith.constant 0 : index %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0_17, %c2016] shape([%c2_14, %c4_15, %c7_16, %c9]) type(%element_type_f32_13) encoding(%dense_row_major_12) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index,
%arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, 
!llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, 
i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = 
llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %c-1_i64 = arith.constant -1 : i64 %8 = util.null : !hal.fence %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %c0_i64 = arith.constant 0 : i64 %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %0 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %1 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %2 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %3 : index to i64 %22 = arith.trunci %21 : i64 to i32
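// NOTE: compared with the "Before CSE" dump, the repeated
// util.global.load @__device_0 / hal.device.allocator loads and the
// duplicated index constants have been folded into single definitions
// (%__device_0, %allocator, %c-1_i64, ...).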
%23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %25 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device %pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %7], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %c7 = arith.constant 7 : index %exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %c-1_i32 = arith.constant -1 : i32 %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %c9 = arith.constant 9 : index %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout =
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 
= llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> 
!llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1, %2, %3]) type(%element_type_f32) encoding(%dense_row_major) %4 = arith.muli %0, %c4 : index %5 = arith.muli %4, %1 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%7) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %c-1_i64 = arith.constant -1 : i64 %8 = util.null : !hal.fence %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %c0_i64 = arith.constant 0 : i64 %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%8) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %0 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %1 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %2 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %3 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer
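// NOTE: command-buffer recording follows: push constants, one descriptor
// set with the three buffer bindings, a single dispatch over a 7x4x1
// workgroup grid, and an execution barrier before finalization; execution
// is queued after %fence and the host blocks on %fence_1 before building
// the result buffer view.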
hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
%pipeline_layout = hal.pipeline_layout.lookup device(%25 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout
hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
  %c0 = (%buffer : !hal.buffer)[%c0, %7],
  %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400],
  %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016]
])
%c7 = arith.constant 7 : index
%exe = hal.executable.lookup device(%25 : !hal.device) executable(@main_dispatch_0) : !hal.executable
%ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None")
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd])
%c-1_i32 = arith.constant -1 : i32
%status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%c9 = arith.constant 9 : index
%view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
util.return %view : !hal.buffer_view
} }
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>
#translation = #iree_codegen.translation_info
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module {
util.global private @__device_0 = #device_target_local
hal.executable private @main_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
%c7 = arith.constant 7 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
hal.return %c7, %c4, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr
{llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = 
llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, 
%1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } 
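// NOTE: everything above closes out the compiled dispatch. The llvm.func is
// the conv_2d_nchw_fchw kernel after full lowering to the LLVM dialect; the
// util.func @main that follows is the host-side program that validates the
// input buffer views, allocates the transient result buffer, and records the
// dispatch into a HAL command buffer. A dump stream like this is normally
// obtained via MLIR's IR-printing options; a plausible invocation (an
// assumption, not taken from this log) would be:
//   iree-compile conv.mlir -o conv.vmfb \
//     --iree-hal-target-backends=llvm-cpu \
//     --mlir-print-ir-before-all --mlir-print-ir-after-all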
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout 
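// NOTE: the eight i32 push constants recorded below encode the four dynamic
// input dimensions. Each 64-bit extent d is split on the host as
//   lo = trunci(d : i64 to i32),  hi = trunci(shrui(d, 32) : i64 to i32)
// and reassembled inside the dispatch (see the zext/shl/or sequence in the
// llvm.func above) as
//   d = or(zext(lo : i32 to i64), shl(zext(hi : i32 to i64), 32))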
hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = 
llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> 
!llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add 
%134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = 
arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : 
!hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x 
vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, 
i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> 
vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index 
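// NOTE: the buffer lengths used below follow directly from the static shapes:
//   %c2400 = 4*6*5*5 filter elements * 4 bytes per f32 = 2400 bytes
//   %c2016 = 2*4*7*9 output elements * 4 bytes per f32 = 2016 bytes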
%c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal 
target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %__device_0 = util.global.load immutable @__device_0 : !hal.device %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 
: !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016}
%9 = arith.index_castui %1 : index to i64
%10 = arith.trunci %9 : i64 to i32
%11 = arith.shrui %9, %c32_i64 : i64
%12 = arith.trunci %11 : i64 to i32
%13 = arith.index_castui %2 : index to i64
%14 = arith.trunci %13 : i64 to i32
%15 = arith.shrui %13, %c32_i64 : i64
%16 = arith.trunci %15 : i64 to i32
%17 = arith.index_castui %3 : index to i64
%18 = arith.trunci %17 : i64 to i32
%19 = arith.shrui %17, %c32_i64 : i64
%20 = arith.trunci %19 : i64 to i32
%21 = arith.index_castui %4 : index to i64
%22 = arith.trunci %21 : i64 to i32
%23 = arith.shrui %21, %c32_i64 : i64
%24 = arith.trunci %23 : i64 to i32
%cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer
%pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) : !hal.pipeline_layout
hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
  %c0 = (%buffer : !hal.buffer)[%c0, %8],
  %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400],
  %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016]
])
%exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable
%ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None")
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd])
%status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
util.return %view : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
%__device_0 = util.global.load immutable @__device_0 : !hal.device
%c9 = arith.constant 9 : index
%c-1_i32 = arith.constant -1 : i32
%c7 = arith.constant 7 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0_i64 = arith.constant 0 : i64
%0 = util.null : !hal.fence
%c-1_i64 = arith.constant -1 : i64
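// NOTE: comparing this dump with the one before
// iree-util-simplify-global-accesses, the only visible change is that the
// immutable util.global.load @__device_0 has been hoisted to the function
// entry (it previously appeared after %buffer); the rest of @main is
// unchanged.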
%c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup
device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) :
i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, 
^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : 
(!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %__device_0 = util.global.load immutable @__device_0 : !hal.device %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 =
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe :
!hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 =
llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" 
%74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> 
vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major =
hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create
device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : 
!llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : 
i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector 
%156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : 
!hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = 
hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> 
i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> 
vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : 
(!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals 
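A note on the ABI visible in the dump above: the dynamic shape of %input0 crosses the host/device boundary as push constants. On the host side, @main widens each dynamic `index` dimension to i64 (arith.index_castui), splits it into low and high 32-bit words (arith.trunci / arith.shrui), and hands all eight words to hal.command_buffer.push_constants; push constants travel as flat 32-bit words in the dispatch state, hence the split rather than passing i64 values directly. On the device side, the kernel reads the words it needs back out of iree_hal_executable_dispatch_state_v0_t and reassembles each dimension with llvm.zext/llvm.shl/llvm.or, then chains them into row-major strides (the %59 = d3, %60 = d3*d2, %61 = d3*d2*d1 sequence). A minimal C sketch of that round trip follows; the helper names are illustrative only, not part of IREE's API:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Host side: split one 64-bit dimension into two 32-bit push-constant
       words, mirroring the arith.trunci / arith.shrui pairs in @main. */
    static void pack_dim(uint64_t dim, uint32_t *lo, uint32_t *hi) {
      *lo = (uint32_t)dim;
      *hi = (uint32_t)(dim >> 32);
    }

    /* Device side: reassemble the dimension from the two words, mirroring
       the llvm.zext / llvm.shl / llvm.or chain in the dispatch function. */
    static uint64_t unpack_dim(uint32_t lo, uint32_t hi) {
      return (uint64_t)lo | ((uint64_t)hi << 32);
    }

    int main(void) {
      /* The four dynamic dims of %input0 in this trace: 2x6x11x13 (NCHW). */
      uint64_t dims[4] = {2, 6, 11, 13};
      uint32_t words[8];
      for (int i = 0; i < 4; ++i)
        pack_dim(dims[i], &words[2 * i], &words[2 * i + 1]);
      for (int i = 0; i < 4; ++i)
        assert(unpack_dim(words[2 * i], words[2 * i + 1]) == dims[i]);
      /* Row-major strides the kernel derives (the %59/%60/%61 chain). */
      printf("w=1 h=%llu c=%llu n=%llu\n",
             (unsigned long long)dims[3],
             (unsigned long long)(dims[3] * dims[2]),
             (unsigned long long)(dims[3] * dims[2] * dims[1]));
      return 0;
    }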
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (module unchanged; body identical to the "IR Dump After FoldGlobals" module above, elided)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (module unchanged; body identical to the "IR Dump After FoldGlobals" module above, elided)
// -----// IR Dump Before LinkExecutablesPass (iree-hal-link-executables) //----- //
// (module unchanged; body identical to the "IR Dump After FoldGlobals" module above, elided)
// -----// IR Dump Before LinkTargetExecutablesPass (iree-hal-link-target-executables) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>
#translation = #iree_codegen.translation_info
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module {
  util.global private @__device_0 = #device_target_local
  hal.executable private @main_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64
target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = 
llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : 
!llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = 
llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : 
index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4
: index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to 
i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> 
%140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } }
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (IR unchanged by this pass; duplicate dump elided.)
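
The zext/shl/or chains on %31..%42 in the dump above are the device half of IREE's dynamic-shape ABI: the host pushed each 64-bit extent of the tensor<?x?x?x?xf32> input as a lo/hi pair of 32-bit push constants (the eight values in hal.command_buffer.push_constants), and the kernel reassembles them from the constants array of iree_hal_executable_dispatch_state_v0_t. A minimal C sketch of that recombination, assuming only the packing visible in the IR (function and parameter names are made up):

#include <stdint.h>

/* Rebuild one i64 extent from a lo/hi pair of i32 push constants,
 * mirroring the zext + shl(32) + or sequence on %31..%34. */
static inline uint64_t unpack_dim(const uint32_t *push_constants, int i) {
  uint64_t lo = (uint64_t)push_constants[2 * i];      /* e.g. %20 -> %31 */
  uint64_t hi = (uint64_t)push_constants[2 * i + 1];  /* e.g. %22 -> %32 */
  return lo | (hi << 32);                             /* %33, %34 */
}
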
"ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> 
%114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br 
// -----// IR Dump Before LLVMCPUAssignConstantOrdinalsPass (iree-llvmcpu-assign-constant-ordinals) //----- //
// (variant IR identical to the dump above; duplicate elided.)
// -----// IR Dump After LLVMCPUAssignConstantOrdinalsPass (iree-llvmcpu-assign-constant-ordinals) //----- //
// (IR unchanged by this pass; duplicate dump elided.)
// -----// IR Dump Before LLVMCPUAssignImportOrdinalsPass (iree-llvmcpu-assign-import-ordinals) //----- //
// (variant IR identical to the dump above; duplicate elided.)
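
For orientation: the export region launches a 7x4x1 workgroup grid, and the output store in ^bb7 (strides 252 = 4*7*9, 63 = 7*9, and 9) suggests workgroup id x selects an output row and id y an output channel of the 2x4x7x9 result. Within each workgroup, ^bb1 iterates the batch (bound 2), ^bb2 steps the output width three columns at a time in a vector<3xf32> accumulator, and ^bb3/^bb4 walk the 5x5 filter window, with six llvm.intr.fmuladd ops unrolled in ^bb5, one per input channel of the 4x6x5x5 filter. A hedged structural sketch in C (index arithmetic elided; not a drop-in kernel):

/* Per-workgroup loop structure recovered from ^bb1..^bb8. */
void workgroup_body(int oh /* workgroup id x */, int f /* workgroup id y */) {
  for (int n = 0; n < 2; ++n) {            /* ^bb1: batch */
    for (int ow = 0; ow < 9; ow += 3) {    /* ^bb2: 3 output columns per step */
      float acc[3] = {0.0f, 0.0f, 0.0f};   /* %9: zeroed accumulator */
      for (int a = 0; a < 5; ++a) {        /* ^bb3: filter window */
        for (int b = 0; b < 5; ++b) {      /* ^bb4: filter window */
          /* ^bb5: six fmuladds, each broadcasting one filter weight
           * against three input values: acc += in3 * splat(w). */
        }
      }
      /* ^bb7: store acc to out[n][f][oh][ow .. ow+2]. */
    }
  }
}
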
// -----// IR Dump After LLVMCPUAssignImportOrdinalsPass (iree-llvmcpu-assign-import-ordinals) //----- //
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 =
llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 
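// The innermost block ^bb5 linearizes row-major offsets by hand. For the dynamically shaped NCHW input (binding 0, base %58) the strides are rebuilt from the push constants above: %59 = W, %60 = W*H, %61 = W*H*C, i.e. 13, 143 and 858 for the 2x6x11x13 input of this example, so element (n, c, h, w) of a row-major NCHW buffer sits at offset n*%61 + c*%60 + h*%59 + w.
// The loop nest reads: ^bb1 over the batch (%70 < 2), ^bb2 over the output width in steps of 3 (%72 < 9, one vector<3xf32> tile per step), ^bb3 over the filter height (%74 < 5), ^bb4 over the filter width (%77 < 5); the six input-channel iterations are fully unrolled into the six load/llvm.intr.fmuladd chains that follow.
// Filter elements come from scalar loads at f*150 + c*25 + kh*5 + kw (the row-major strides of the static 4x6x5x5 filter) and are splatted to vector<3xf32> with insertelement + shufflevector before each fmuladd.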
%86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, 
vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } // -----// IR Dump After LinkTargetExecutablesPass (iree-hal-link-target-executables) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 
16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : 
!llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : 
vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public 
@main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout 
hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x
array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", 
(i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> 
vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = 
arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal 
target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
// -----// IR Dump After LinkExecutablesPass (iree-hal-link-executables) //----- //
// (module unchanged by this pass; the dump is a verbatim copy of the IR above and is elided)
// -----// IR Dump Before ResolveExportOrdinalsPass (iree-hal-resolve-export-ordinals) //----- //
// (identical to the preceding dump; verbatim duplicate elided)
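// The next dump shows the only change iree-hal-resolve-export-ordinals makes to this
// module: the symbolic export-ordinal query feeding the dispatch is folded to the
// export's resolved ordinal. @main_dispatch_0 declares a single export at ordinal(0),
// so in @main the pair
//   %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index
//   hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None")
// becomes
//   %c0_1 = arith.constant 0 : index
//   hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%c0_1] workgroups([%c7, %c4, %c1]) flags("None")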
source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before ResolveExportOrdinalsPass (iree-hal-resolve-export-ordinals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, 
ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: 
!llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, 
%77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) 
type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %ordinal = hal.executable.export.ordinal target(@main_dispatch_0::@embedded_elf_x86_64::@main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32) : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> 
affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After ResolveExportOrdinalsPass (iree-hal-resolve-export-ordinals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = 
llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 
: i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : 
(vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : 
!hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %c0_1 = arith.constant 0 : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%c0_1] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_2 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_2) commands([%cmd]) %status = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR 
Dump Before MaterializeResourceCachesPass (iree-hal-materialize-resource-caches) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 
: i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 
%95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = 
llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 =
hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %pipeline_layout = hal.pipeline_layout.lookup device(%__device_0 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %exe = hal.executable.lookup device(%__device_0 : !hal.device) executable(@main_dispatch_0) : !hal.executable %c0_1 = arith.constant 0 : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%c0_1] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_2 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_2) commands([%cmd]) %status = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After MaterializeResourceCachesPass (iree-hal-materialize-resource-caches) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> 
#pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %0 = arith.select %value, %c0, %c-1 : index %1 = scf.index_switch %0 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { %c14_i32 = arith.constant 14 : i32 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" %2 = util.null : !hal.executable scf.yield %2 : !hal.executable } util.global.store %1, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 =
llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br 
^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 
%143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 =
arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %c0_1 = arith.constant 0 : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer>
target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0_1] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_2 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_2) commands([%cmd]) %status = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before MemoizeDeviceQueriesPass (iree-hal-memoize-device-queries) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %0 = arith.select %value, %c0, %c-1 : index %1 = scf.index_switch %0 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { %c14_i32 = arith.constant 14 : i32 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" %2 = util.null : !hal.executable scf.yield %2 : !hal.executable } util.global.store %1, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32
ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, 
ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = 
llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = 
llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 =
arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %c0_1 = arith.constant 0 : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0_1] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_2 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_2) commands([%cmd]) %status = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After MemoizeDeviceQueriesPass (iree-hal-memoize-device-queries) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert> #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private
@__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %0 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %1 = scf.index_switch %0 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { %c14_i32 = arith.constant 14 : i32 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" %2 = util.null : !hal.executable scf.yield %2 : !hal.executable } util.global.store %1, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = 
llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), 
^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> 
!llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : 
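// NOTE: in the kernel tail above, ^bb7 stores the accumulated vector<3xf32> directly to
// the output binding (alignment 4) and bumps the output-width induction variable by 3; the
// enclosing nest iterates batch (bound 2) and output width (bound 9, step 3), the two inner
// bounds of 5 walk the 5x5 filter window (the filter index is oc*150 + ic*25 + kh*5 + kw),
// and oh and oc come from workgroup ids x and y of the 7x4x1 grid.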
!hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %c0_1 = arith.constant 0 : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0_1] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : 
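// NOTE: this @main ABI wrapper validates both buffer views against the expected element
// type and shape, computes the input byte length as dim0*4*dim1*dim2*dim3, queue-allocates
// a 2016-byte transient buffer (2*4*7*9 floats) for the result, and records a one-shot
// command buffer: eight i32 push constants (the four i64 dims split into lo/hi halves),
// three buffer bindings, one 7x4x1 dispatch, a barrier, then execute + fence await.
// The dim packing sketched in C (names hypothetical):
//   push_constants[2*i]     = (uint32_t)dims[i];          // arith.trunci
//   push_constants[2*i + 1] = (uint32_t)(dims[i] >> 32);  // arith.shrui + trunci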
!hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_2 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_2) commands([%cmd]) %status = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %0 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %1 = scf.index_switch %0 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : 
!hal.executable } default { %c14_i32 = arith.constant 14 : i32 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" %2 = util.null : !hal.executable scf.yield %2 : !hal.executable } util.global.store %1, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", 
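// NOTE: the initializers above run once at module load: hal.device.query asks whether
// __device_0 supports the "embedded-elf-x86_64" format, and scf.index_switch either creates
// the executable from @main_dispatch_0::@embedded_elf_x86_64 or fails with status code 14
// (UNAVAILABLE in IREE's status convention), caching the pipeline layout and executable in
// util.globals so @main only pays a global load per call. The export name
// main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 encodes the problem itself: a 2x4x7x9
// output with a 6x5x5 reduction. With unit stride, unit dilation, and no padding, the
// spatial sizes follow out = in - k + 1, so an 11x13 input under a 5x5 filter yields 7x9.
// A quick check in C:
//   int oh = 11 - 5 + 1, ow = 13 - 5 + 1;            // 7, 9
//   size_t bytes = 2 * 4 * oh * ow * sizeof(float);  // 2016, the transient buffer size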
(i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = 
llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, 
%1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : 
!hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %c0_1 = arith.constant 0 : index hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0_1] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_2 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_2) commands([%cmd]) %status = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private 
@__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %0 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %1 = scf.index_switch %0 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { %c14_i32 = arith.constant 14 : i32 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" %2 = util.null : !hal.executable scf.yield %2 : !hal.executable } util.global.store %1, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = 
llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : 
!llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 
%138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = 
arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = 
(%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %0 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, 
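// NOTE: this Before-Canonicalizer dump shows the module exactly as CSE left it; adjacent
// "After pass X / Before pass Y" dumps in a --mlir-print-ir-after-all /
// --mlir-print-ir-before-all log are expected to match when no pass runs in between.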
%c-1 : index %1 = scf.index_switch %0 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { %c14_i32 = arith.constant 14 : i32 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" %2 = util.null : !hal.executable scf.yield %2 : !hal.executable } util.global.store %1, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = 
llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = 
llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = 
llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : 
!hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info
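// Annotation: in @main, each of the four dynamic i64 dimensions of %input0 is split into two i32 push constants, lo = trunci(d) and hi = trunci(shrui(d, 32)), which is why #pipeline_layout declares push_constants = 8 for a single dispatch. The buffer asserts check against the static element counts: %c2400 = 4*6*5*5 filter elements * 4 B, and the transient result buffer is %c2016 = 2*4*7*9 elements * 4 B. The recorded workgroup count [%c7, %c4, %c1] is effectively one workgroup per (output row, filter) pair, with the batch and width loops kept inside the kernel.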
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c0 = arith.constant 0 : index %c-1 = arith.constant -1 : index %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = scf.index_switch %1 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %2, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : 
i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 
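// Annotation: %34, %38 and %42 rebuild the i64 sizes from push constants 2..7 as zext(lo) | (zext(hi) << 32); the batch dimension is not reloaded because it has already folded to the constant 2 (%14). %59, %60 and %61 are then the row-major strides W, W*H and W*H*C of the dynamic input, and each binding base pointer (%46, %53, %62) is proven 64-byte aligned for LLVM with a ptrtoint / and %4 (= 63) / icmp eq 0 / llvm.intr.assume chain.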
"llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = 
llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: 
tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : 
!hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
// -----// IR Dump After CSE (cse) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c0 = arith.constant 0 : index %c-1 = arith.constant -1 : index %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = scf.index_switch %1 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %2, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64,
llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> 
!llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> 
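// Annotation: %128 -> %129 -> %130 is a scalar splat: the filter value just loaded is inserted into lane 0 of a vector<3xf32> and then broadcast with shufflevector mask [0, 0, 0], so each llvm.intr.fmuladd accumulates three horizontally adjacent output pixels at once; this is why the width loop in ^bb2 covers the 9 output columns in steps of 3 (%11).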
%130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: 
!hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd :
!hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.return } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c0 = arith.constant 0 : index %c-1 = arith.constant -1 : index %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load 
@__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = scf.index_switch %1 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %2, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.return } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c0 = arith.constant 0 : index %c-1 = arith.constant -1 : index %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = scf.index_switch %1 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %2, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
%element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") :
!hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 =
arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global
private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c0 = arith.constant 0 : index %c-1 = arith.constant -1 : index %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = scf.index_switch %1 -> !hal.executable case 0 { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } default { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %2, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, 
i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 
preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, 
vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] :
index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout
util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 
%112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add 
%77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64)
type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value,
@__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = 
llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, 
%14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = 
llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : 
!hal.device %__device_0_pipeline_layout_0 = util.global.load @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") 
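// This module is the input to FoldGlobals (iree-util-fold-globals). Comparing it
// with the "IR Dump After FoldGlobals" module below shows two changes: the
// never-read @__device_0_query_0_hal_executable_format_embedded_elf_x86_64_ok
// global (and the store feeding it in the first initializer) is deleted, and the
// loads of the pipeline-layout and executable globals in this function become
// `util.global.load immutable`, since each of those globals is stored exactly
// once, from its initializer.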
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, 
@__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = 
llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> 
vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue 
%176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) 
flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: 
"embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) 
: i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, 
^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : 
(!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device 
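// The surrounding util.global.load ops now carry `immutable` (added by
// FoldGlobals): each of these globals is written exactly once, by its
// initializer, so later passes may treat the loaded values as constants.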
%__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") 
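// FuseGlobals (iree-util-fuse-globals), whose before/after dumps bracket this
// module, merges globals that provably always hold the same value. The three
// device-state globals here (device, pipeline layout, executable) hold distinct
// values, so no fusion is expected in this module.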
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, 
@__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = 
llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> 
vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue 
%176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) 
flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } // -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } // -----// IR Dump Before ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index 
%c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump Before ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> 
(%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : 
!hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : 
!hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before InitializeDevicesPass (iree-hal-initialize-devices) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, 
Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info #device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device module { util.global private @__device_0 = #device_target_local util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : 
index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> 
!llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add 
%134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = 
arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, 
i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After InitializeDevicesPass (iree-hal-initialize-devices) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info module { util.global private @__device_0 : !hal.device util.initializer { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %0 = util.null : !hal.device %device_count = hal.devices.count : index %1:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %0) : (index, index, !hal.device) -> (index, index, !hal.device) { %4 = util.cmp.eq %arg2, %0 : !hal.device %5 = arith.cmpi slt, %arg0, %device_count : index %6 = arith.andi %4, %5 : i1 scf.condition(%6) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %4 = scf.if %value -> (i1) { %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_1 : i1 } else { %false = arith.constant false scf.yield %false : i1 } %5 = arith.cmpi eq, %arg1, %c0 : index %6 = arith.select %4, %c1, %c0 : index %7 = arith.addi %arg1, %6 : index %8 = arith.andi %4, %5 : i1 %9 = arith.select %8, %device_n, %0 : !hal.device %10 = arith.addi %arg0, %c1 : index scf.yield %10, %7, %9 : index, index, !hal.device } %2 = util.null : !hal.device %3 = util.cmp.eq %1#2, %2 : !hal.device scf.if %3 { %c5_i32 = arith.constant 5 : i32 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, 
native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" } util.global.store %1#2, @__device_0 : !hal.device util.return } util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 
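// Annotation: the i64 constants here are precomputed linearized strides for the static shapes: 252 = 4*7*9 and 63 = 7*9 index the 2x4x7x9 output, while 150 = 6*5*5 and 25 = 5*5 index the 4x6x5x5 filter. The dynamic input strides (%59-%61 further down) are reassembled at runtime from the push-constant dims.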
%7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : 
!llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 
%138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = 
arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> 
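// -----// Note: host-side push constants //----- //
// This is the host half of the ABI the kernel decodes: every dynamic dim of
// %arg0 is widened with arith.index_castui and split into lo/hi i32 halves
// by arith.trunci / arith.shrui 32, yielding the eight push-constant words.
// A hedged C mirror of one split:
//
//   uint64_t d = (uint64_t)dim;
//   uint32_t lo = (uint32_t)d, hi = (uint32_t)(d >> 32);
//
// The one-shot command buffer being recorded here then receives the
// constants, a single descriptor set binding the two inputs plus the
// transient result buffer, a [7, 4, 1] dispatch, and a full execution
// barrier before it is finalized and queued.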
layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before CombineInitializers (iree-util-combine-initializers) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info module { util.global private @__device_0 : !hal.device util.initializer { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %0 = util.null : !hal.device %device_count = hal.devices.count : index %1:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %0) : (index, index, !hal.device) -> (index, index, !hal.device) { %4 = util.cmp.eq %arg2, %0 : !hal.device %5 = arith.cmpi slt, %arg0, %device_count : index %6 = arith.andi %4, %5 : i1 scf.condition(%6) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %4 = scf.if %value -> (i1) { %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_1 : i1 } else { %false = arith.constant false scf.yield %false : i1 } %5 = arith.cmpi eq, %arg1, %c0 : index %6 = arith.select %4, %c1, %c0 : index %7 = arith.addi %arg1, %6 : index %8 = arith.andi %4, %5 : i1 %9 = arith.select %8, %device_n, %0 : !hal.device %10 = arith.addi %arg0, %c1 : index scf.yield %10, %7, %9 : index, index, !hal.device } %2 = util.null : !hal.device %3 = util.cmp.eq %1#2, %2 : !hal.device scf.if %3 { %c5_i32 = arith.constant 5 : i32 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" } 
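// -----// Note: device selection initializer //----- //
// The initializer above scans devices with scf.while over hal.devices.count,
// accepting the first one whose "hal.device.id" matches "local*" and that
// supports the embedded-elf-x86_64 executable format. On failure,
// util.status.check_ok raises code 5 (NOT_FOUND in the canonical status
// numbering) carrying the quoted #hal.device.target description; otherwise
// the selected device is stored into @__device_0 just below, where the later
// initializers reload it.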
util.global.store %1#2, @__device_0 : !hal.device util.return } util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.initializer { %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.return } util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %0 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %2 = arith.cmpi eq, %1, %c0 : index %3 = scf.if %2 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %3, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x 
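// -----// Note: lazy executable creation //----- //
// The remaining initializers above cache the format query in an i1 global,
// then either create the @embedded_elf_x86_64 variant of @main_dispatch_0
// (arith.select maps the query to index 0 or -1, and the scf.if takes the
// creation path on 0) or raise code 14 (UNAVAILABLE in the canonical status
// numbering) listing the formats that were compiled in. The layout and
// executable handles land in util.globals, so this cost is paid once at
// module load rather than per dispatch.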
vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, 
ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = 
llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 
: i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, 
%8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %0 = util.null : !hal.device %device_count = hal.devices.count : index %1:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %0) : (index, index, !hal.device) -> (index, index, !hal.device) { %8 = util.cmp.eq %arg2, %0 : !hal.device %9 = arith.cmpi slt, %arg0, %device_count : index %10 = arith.andi %8, %9 : i1 scf.condition(%10) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok_2, %value_3 = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %8 = scf.if %value_3 -> (i1) { %ok_4, %value_5 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_5 : i1 } else { %false = arith.constant false scf.yield %false : i1 } %9 = arith.cmpi eq, %arg1, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %arg1, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %0 : !hal.device %14 = arith.addi %arg0, %c1 : index scf.yield %14, %11, %13 : index, index, !hal.device } %2 = util.null : !hal.device %3 = util.cmp.eq %1#2, %2 : !hal.device scf.if %3 { %c5_i32 = arith.constant 5 : i32 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = 
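// -----// Note: effect of CombineInitializers //----- //
// This dump's single util.initializer (which this note sits inside) is the
// three initializers of the previous dump fused in their original order:
// device selection, format query, then pipeline-layout/executable creation.
// The util.global declarations are hoisted to the top of the module, and
// only SSA names change, with colliding constants renamed (%c0_0) and
// repeated global loads suffixed (%__device_0_1).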
\22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" } util.global.store %1#2, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %c-1 = arith.constant -1 : index %c0_0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %4 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_1 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_1 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_1 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %5 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0_0, %c-1 : index %6 = arith.cmpi eq, %5, %c0_0 : index %7 = scf.if %6 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0_1 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %4 : !hal.executable } util.global.store %7, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : 
!llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : 
!llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, 
%131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = 
arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : 
!hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before AffineExpandIndexOps (affine-expand-index-ops) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %0 = util.null : !hal.device %device_count = hal.devices.count : index %1:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %0) : (index, index, !hal.device) -> (index, index, !hal.device) { %8 = util.cmp.eq %arg2, %0 : !hal.device %9 = arith.cmpi slt, %arg0, %device_count : index %10 = arith.andi %8, %9 : i1 scf.condition(%10) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok_2, %value_3 = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %8 = scf.if %value_3 -> (i1) { %ok_4, %value_5 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_5 : i1 } else { %false = arith.constant false scf.yield %false : i1 } %9 = arith.cmpi eq, %arg1, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %arg1, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %0 : !hal.device %14 = arith.addi %arg0, %c1 : index scf.yield %14, %11, %13 : index, index, !hal.device } %2 = util.null : !hal.device %3 = util.cmp.eq %1#2, %2 : !hal.device scf.if %3 { %c5_i32 = arith.constant 5 : i32 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple 
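// -----// Note: affine-expand-index-ops //----- //
// This pass expands affine.linearize_index / affine.delinearize_index ops
// into plain arith arithmetic. The module is already fully lowered to the
// HAL/LLVM level at this point, so little if any affine IR remains and the
// dump before the pass is effectively identical to the one after
// CombineInitializers.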
= \22x86_64-unknown-unknown-eabi-elf\22}>]>" } util.global.store %1#2, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %c-1 = arith.constant -1 : index %c0_0 = arith.constant 0 : index %c14_i32 = arith.constant 14 : i32 %4 = util.null : !hal.executable %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_1 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_1 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_1 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %5 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0_0, %c-1 : index %6 = arith.cmpi eq, %5, %c0_0 : index %7 = scf.if %6 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0_1 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %4 : !hal.executable } util.global.store %7, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = 
llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 
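// Loop nest: ^bb1 iterates the batch (bound %14 = 2), ^bb2 the output columns (bound %12 = 9, step %11 = 3, one vector<3xf32> per step), and ^bb3/^bb4 the 5x5 filter window; the accumulator travels through the block arguments as !llvm.array<1 x array<1 x vector<3xf32>>>.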
preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = 
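// ^bb5 is unrolled six ways: each step loads a vector<3xf32> from binding 0 (the input), broadcasts one f32 from binding 1 (the 4x6x5x5 filter) via insertelement + shufflevector, and folds it in with llvm.intr.fmuladd; ^bb7 then stores the finished vector to binding 2 (the output) at offset %70*252 + %69*63 + %67*9 + %72.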
llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = 
util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, 
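// The one-shot command buffer records the eight push constants, a push_descriptor_set binding the two input buffers plus the transient buffer, a dispatch over 7x4x1 workgroups, and an execution barrier; it is then finalized, queued behind the alloca fence, and the transient buffer is returned as a 2x4x7x9 buffer_view.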
%c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After AffineExpandIndexOps (affine-expand-index-ops) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index %2:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %1) : (index, index, !hal.device) -> (index, index, !hal.device) { %7 = util.cmp.eq %arg2, %1 : !hal.device %8 = arith.cmpi slt, %arg0, %device_count : index %9 = arith.andi %7, %8 : i1 scf.condition(%9) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok_1, %value_2 = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %7 = scf.if %value_2 -> (i1) { %ok_3, %value_4 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_4 : i1 } else { scf.yield %false : i1 } %8 = arith.cmpi eq, %arg1, %c0 : index %9 = arith.select %7, %c1, %c0 : index %10 = arith.addi %arg1, %9 : index %11 = arith.andi %7, %8 : i1 %12 = arith.select %11, %device_n, %1 : !hal.device %13 = arith.addi %arg0, %c1 : index scf.yield %13, %10, %12 : index, index, !hal.device } %3 = util.cmp.eq %2#2, %1 : !hal.device scf.if %3 { util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" } util.global.store %2#2, @__device_0 : !hal.device %__device_0 = 
util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %4 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %5 = arith.cmpi eq, %4, %c0 : index %6 = scf.if %5 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %6, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x 
vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 
[0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable 
@__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : 
!hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before ConvertAffineToStandard (lower-affine) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index %2:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %1) : (index, index, !hal.device) -> (index, index, !hal.device) { %7 = util.cmp.eq %arg2, %1 : !hal.device %8 = arith.cmpi slt, %arg0, %device_count : index %9 = arith.andi %7, %8 : i1 scf.condition(%9) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok_1, %value_2 = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %7 = scf.if %value_2 -> (i1) { %ok_3, %value_4 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_4 : i1 } else { scf.yield %false : i1 } %8 = arith.cmpi eq, %arg1, %c0 : index %9 = arith.select %7, %c1, %c0 : index %10 = arith.addi %arg1, %9 : index %11 = arith.andi %7, %8 : i1 %12 = arith.select %11, %device_n, %1 : !hal.device %13 = arith.addi %arg0, %c1 : index scf.yield %13, %10, %12 : index, index, !hal.device } %3 = util.cmp.eq %2#2, %1 : !hal.device scf.if %3 { util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" } util.global.store %2#2, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, 
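// Before ConvertAffineToStandard: still the identical module; lower-affine would rewrite affine.apply/affine.for into arith and scf ops, but none remain at this point in the pipeline.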
@__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %4 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %5 = arith.cmpi eq, %4, %c0 : index %6 = scf.if %5 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %6, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", 
(i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x 
vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = 
llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : 
!hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) 
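// Synchronous ABI: @main waits on %fence_1 with an infinite timeout (timeout_millis = -1) before wrapping the result, matching the "sync func" declaration in the iree.reflection attribute.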
%status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]> #translation = #iree_codegen.translation_info module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index %2:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %1) : (index, index, !hal.device) -> (index, index, !hal.device) { %7 = util.cmp.eq %arg2, %1 : !hal.device %8 = arith.cmpi slt, %arg0, %device_count : index %9 = arith.andi %7, %8 : i1 scf.condition(%9) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok_1, %value_2 = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %7 = scf.if %value_2 -> (i1) { %ok_3, %value_4 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_4 : i1 } else { scf.yield %false : i1 } %8 = arith.cmpi eq, %arg1, %c0 : index %9 = arith.select %7, %c1, %c0 : index %10 = arith.addi %arg1, %9 : index %11 = arith.andi %7, %8 : i1 %12 = arith.select %11, %device_n, %1 : !hal.device %13 = arith.addi %arg0, %c1 : index scf.yield %13, %10, %12 : index, index, !hal.device } %3 = util.cmp.eq %2#2, %1 : !hal.device scf.if %3 { util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" } util.global.store %2#2, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_0 = util.global.load 
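// After ConvertAffineToStandard: unchanged again, as expected with no affine ops present in the module.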
@__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %4 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %5 = arith.cmpi eq, %4, %c0 : index %6 = scf.if %5 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %6, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #translation} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : 
!llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : 
i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector 
%156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2,
%3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) 
type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before SCFToControlFlow (convert-scf-to-cf) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index %2:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %1) : (index, index, !hal.device) -> (index, index, !hal.device) { %7 = util.cmp.eq %arg2, %1 : !hal.device %8 = arith.cmpi slt, %arg0, %device_count : index %9 = arith.andi %7, %8 : i1 scf.condition(%9) %arg0, %arg1, %arg2 : index, index, !hal.device } do { ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): %device_n = hal.devices.get %arg0 : !hal.device %ok_1, %value_2 = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false %7 = scf.if %value_2 -> (i1) { %ok_3, %value_4 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false scf.yield %value_4 : i1 } else { scf.yield %false : i1 } %8 = arith.cmpi eq, %arg1, %c0 : index %9 = arith.select %7, %c1, %c0 : index %10 = arith.addi %arg1, %9 : index %11 = arith.andi %7, %8 : i1 %12 = arith.select %11, %device_n, %1 : !hal.device %13 = arith.addi %arg0, %c1 : index scf.yield %13, %10, %12 : index, index, !hal.device } %3 = util.cmp.eq %2#2, %1 : !hal.device scf.if %3 { util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" } util.global.store %2#2, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_0 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_0 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_0 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %4 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %5 = arith.cmpi eq, %4, %c0 : index %6 = scf.if %5 -> (!hal.executable) { %exe = hal.executable.create device(%__device_0_0 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable scf.yield %exe : !hal.executable } else { util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: 
[embedded-elf-x86_64]" scf.yield %0 : !hal.executable } util.global.store %6, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb6 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3, %4 : index, index, !hal.device), ^bb7 ^bb2(%8: index, %9: index, %10: !hal.device): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb5(%value_1 : i1) ^bb4: // pred: ^bb2 cf.br ^bb5(%false : i1) ^bb5(%11: i1): // 2 preds: ^bb3, ^bb4 cf.br ^bb6 ^bb6: // pred: ^bb5 %12 = arith.cmpi eq, %9, %c0 : index %13 = arith.select %11, %c1, %c0 : index %14 = arith.addi %9, %13 : index %15 = arith.andi %11, %12 : i1 %16 = arith.select %15, %device_n, %1 : !hal.device %17 = arith.addi %8, %c1 : index cf.br ^bb1(%17, %14, %16 : index, index, !hal.device) ^bb7: // pred: ^bb1 %18 = util.cmp.eq %4, %1 : !hal.device cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb9 ^bb9: // 2 preds: ^bb7, ^bb8 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %19 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %20 = arith.cmpi eq, %19, %c0 : index cf.cond_br %20, ^bb10, ^bb11 ^bb10: // pred: ^bb9 %exe = 
hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb12(%exe : !hal.executable) ^bb11: // pred: ^bb9 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb12(%0 : !hal.executable) ^bb12(%21: !hal.executable): // 2 preds: ^bb10, ^bb11 cf.br ^bb13 ^bb13: // pred: ^bb12 util.global.store %21, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump Before SerializeExecutablesPass (iree-hal-serialize-executables) //----- // hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 = llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr,
i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : 
i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = 
llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } // -----// IR Dump Before SerializeTargetExecutablesPass (iree-hal-serialize-target-executables) //----- // hal.executable private @main_dispatch_0 { hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { hal.executable.export public @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): %c7 = arith.constant 7 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index hal.return %c7, %c4, %c1 : index, index, index } builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { llvm.func @main_dispatch_0_conv_2d_nchw_fchw_2x4x7x9x6x5x5_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.undef : vector<3xf32> %2 = llvm.mlir.constant(1 : i64) : i64 %3 = llvm.mlir.constant(252 : index) : i64 %4 = llvm.mlir.constant(63 : index) : i64 %5 = llvm.mlir.constant(25 : index) : i64 %6 =
llvm.mlir.constant(150 : index) : i64 %7 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x3xf32>) : !llvm.array<1 x vector<3xf32>> %8 = llvm.mlir.constant(4 : index) : i64 %9 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x3xf32>) : !llvm.array<1 x array<1 x vector<3xf32>>> %10 = llvm.mlir.constant(5 : index) : i64 %11 = llvm.mlir.constant(3 : index) : i64 %12 = llvm.mlir.constant(9 : index) : i64 %13 = llvm.mlir.constant(1 : index) : i64 %14 = llvm.mlir.constant(2 : index) : i64 %15 = llvm.mlir.constant(0 : index) : i64 %16 = llvm.mlir.constant(32 : i64) : i64 %17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %18 = llvm.extractvalue %17[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %19 = llvm.getelementptr %18[2] : (!llvm.ptr) -> !llvm.ptr, i32 %20 = llvm.load %19 : !llvm.ptr -> i32 %21 = llvm.getelementptr %18[3] : (!llvm.ptr) -> !llvm.ptr, i32 %22 = llvm.load %21 : !llvm.ptr -> i32 %23 = llvm.getelementptr %18[4] : (!llvm.ptr) -> !llvm.ptr, i32 %24 = llvm.load %23 : !llvm.ptr -> i32 %25 = llvm.getelementptr %18[5] : (!llvm.ptr) -> !llvm.ptr, i32 %26 = llvm.load %25 : !llvm.ptr -> i32 %27 = llvm.getelementptr %18[6] : (!llvm.ptr) -> !llvm.ptr, i32 %28 = llvm.load %27 : !llvm.ptr -> i32 %29 = llvm.getelementptr %18[7] : (!llvm.ptr) -> !llvm.ptr, i32 %30 = llvm.load %29 : !llvm.ptr -> i32 %31 = llvm.zext %20 : i32 to i64 %32 = llvm.zext %22 : i32 to i64 %33 = llvm.shl %32, %16 : i64 %34 = llvm.or %31, %33 : i64 %35 = llvm.zext %24 : i32 to i64 %36 = llvm.zext %26 : i32 to i64 %37 = llvm.shl %36, %16 : i64 %38 = llvm.or %35, %37 : i64 %39 = llvm.zext %28 : i32 to i64 %40 = llvm.zext %30 : i32 to i64 %41 = llvm.shl %40, %16 : i64 %42 = llvm.or %39, %41 : i64 %43 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %44 = llvm.getelementptr %43[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %45 = llvm.load %44 : !llvm.ptr -> !llvm.ptr %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 %47 = llvm.and %46, %4 : i64 %48 = llvm.icmp "eq" %47, %15 : i64 "llvm.intr.assume"(%48) : (i1) -> () %49 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %50 = llvm.extractvalue %49[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %51 = llvm.getelementptr %50[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr %52 = llvm.load %51 : !llvm.ptr -> !llvm.ptr %53 = llvm.ptrtoint %52 : !llvm.ptr to i64 %54 = llvm.and %53, %4 : i64 %55 = llvm.icmp "eq" %54, %15 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %57 = llvm.extractvalue %56[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> %58 = llvm.load %57 : !llvm.ptr -> !llvm.ptr %59 = llvm.mul %42, %2 : i64 %60 = llvm.mul %59, %38 : i64 %61 = llvm.mul %60, %34 : i64 %62 = llvm.ptrtoint %58 : !llvm.ptr to i64 %63 = llvm.and %62, %4 : i64 %64 = llvm.icmp "eq" %63, %15 : i64 "llvm.intr.assume"(%64) : (i1) -> () %65 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, 
i32)> %66 = llvm.extractvalue %65[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %67 = llvm.zext %66 : i32 to i64 %68 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)> %69 = llvm.zext %68 : i32 to i64 llvm.br ^bb1(%15 : i64) ^bb1(%70: i64): // 2 preds: ^bb0, ^bb8 %71 = llvm.icmp "slt" %70, %14 : i64 llvm.cond_br %71, ^bb2(%15 : i64), ^bb9 ^bb2(%72: i64): // 2 preds: ^bb1, ^bb7 %73 = llvm.icmp "slt" %72, %12 : i64 llvm.cond_br %73, ^bb3(%15, %9 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb8 ^bb3(%74: i64, %75: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb2, ^bb6 %76 = llvm.icmp "slt" %74, %10 : i64 llvm.cond_br %76, ^bb4(%15, %75 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>), ^bb7 ^bb4(%77: i64, %78: !llvm.array<1 x array<1 x vector<3xf32>>>): // 2 preds: ^bb3, ^bb5 %79 = llvm.icmp "slt" %77, %10 : i64 llvm.cond_br %79, ^bb5, ^bb6 ^bb5: // pred: ^bb4 %80 = llvm.add %67, %74 : i64 %81 = llvm.add %72, %77 : i64 %82 = llvm.mul %70, %61 : i64 %83 = llvm.mul %60, %15 : i64 %84 = llvm.add %82, %83 : i64 %85 = llvm.mul %80, %59 : i64 %86 = llvm.add %84, %85 : i64 %87 = llvm.add %86, %81 : i64 %88 = llvm.getelementptr %58[%87] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %90 = llvm.add %80, %13 : i64 %91 = llvm.mul %90, %59 : i64 %92 = llvm.add %84, %91 : i64 %93 = llvm.add %92, %81 : i64 %94 = llvm.getelementptr %58[%93] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %95 = llvm.load %94 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %96 = llvm.add %80, %14 : i64 %97 = llvm.mul %96, %59 : i64 %98 = llvm.add %84, %97 : i64 %99 = llvm.add %98, %81 : i64 %100 = llvm.getelementptr %58[%99] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %101 = llvm.load %100 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %102 = llvm.add %80, %11 : i64 %103 = llvm.mul %102, %59 : i64 %104 = llvm.add %84, %103 : i64 %105 = llvm.add %104, %81 : i64 %106 = llvm.getelementptr %58[%105] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %107 = llvm.load %106 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %108 = llvm.add %80, %8 : i64 %109 = llvm.mul %108, %59 : i64 %110 = llvm.add %84, %109 : i64 %111 = llvm.add %110, %81 : i64 %112 = llvm.getelementptr %58[%111] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %113 = llvm.load %112 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %114 = llvm.add %80, %10 : i64 %115 = llvm.mul %114, %59 : i64 %116 = llvm.add %84, %115 : i64 %117 = llvm.add %116, %81 : i64 %118 = llvm.getelementptr %58[%117] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %119 = llvm.load %118 {alignment = 4 : i64} : !llvm.ptr -> vector<3xf32> %120 = llvm.extractvalue %78[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %121 = llvm.mul %69, %6 : i64 %122 = llvm.mul %15, %5 : i64 %123 = llvm.add %121, %122 : i64 %124 = llvm.mul %74, %10 : i64 %125 = llvm.add %123, %124 : i64 %126 = llvm.add %125, %77 : i64 %127 = llvm.getelementptr %45[%126] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %128 = llvm.load %127 : !llvm.ptr -> f32 %129 = llvm.insertelement %128, %1[%0 : i32] : vector<3xf32> %130 = llvm.shufflevector %129, %1 [0, 0, 0] : vector<3xf32> %131 = llvm.intr.fmuladd(%89, %130, %120) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %132 = llvm.mul %13, %5 : i64 %133 = llvm.add %121, %132 : i64 %134 = llvm.add %133, %124 : i64 %135 = llvm.add %134, %77 : i64 %136 = llvm.getelementptr %45[%135] : (!llvm.ptr, i64) -> !llvm.ptr, f32 
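// NOTE: this block (^bb5) is the fully unrolled innermost body of the
// decomposed conv: six strided vector<3xf32> loads from the input buffer and
// six scalar filter taps read 25 elements apart (one 5x5 filter plane per
// input channel, matching the 150-element stride per output channel), each
// tap splat across a vector<3xf32> and folded into the accumulator with an
// FMA. The recurring splat-and-FMA idiom, sketched with illustrative SSA
// names rather than the numbered values of this dump:
//   %e    = llvm.insertelement %tap, %undef[%c0_i32 : i32] : vector<3xf32>
//   %s    = llvm.shufflevector %e, %undef [0, 0, 0] : vector<3xf32>
//   %acc1 = llvm.intr.fmuladd(%in, %s, %acc0) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32>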
%137 = llvm.load %136 : !llvm.ptr -> f32 %138 = llvm.insertelement %137, %1[%0 : i32] : vector<3xf32> %139 = llvm.shufflevector %138, %1 [0, 0, 0] : vector<3xf32> %140 = llvm.intr.fmuladd(%95, %139, %131) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %141 = llvm.mul %14, %5 : i64 %142 = llvm.add %121, %141 : i64 %143 = llvm.add %142, %124 : i64 %144 = llvm.add %143, %77 : i64 %145 = llvm.getelementptr %45[%144] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %146 = llvm.load %145 : !llvm.ptr -> f32 %147 = llvm.insertelement %146, %1[%0 : i32] : vector<3xf32> %148 = llvm.shufflevector %147, %1 [0, 0, 0] : vector<3xf32> %149 = llvm.intr.fmuladd(%101, %148, %140) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %150 = llvm.mul %11, %5 : i64 %151 = llvm.add %121, %150 : i64 %152 = llvm.add %151, %124 : i64 %153 = llvm.add %152, %77 : i64 %154 = llvm.getelementptr %45[%153] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %155 = llvm.load %154 : !llvm.ptr -> f32 %156 = llvm.insertelement %155, %1[%0 : i32] : vector<3xf32> %157 = llvm.shufflevector %156, %1 [0, 0, 0] : vector<3xf32> %158 = llvm.intr.fmuladd(%107, %157, %149) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %159 = llvm.mul %8, %5 : i64 %160 = llvm.add %121, %159 : i64 %161 = llvm.add %160, %124 : i64 %162 = llvm.add %161, %77 : i64 %163 = llvm.getelementptr %45[%162] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %164 = llvm.load %163 : !llvm.ptr -> f32 %165 = llvm.insertelement %164, %1[%0 : i32] : vector<3xf32> %166 = llvm.shufflevector %165, %1 [0, 0, 0] : vector<3xf32> %167 = llvm.intr.fmuladd(%113, %166, %158) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %168 = llvm.mul %10, %5 : i64 %169 = llvm.add %121, %168 : i64 %170 = llvm.add %169, %124 : i64 %171 = llvm.add %170, %77 : i64 %172 = llvm.getelementptr %45[%171] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %173 = llvm.load %172 : !llvm.ptr -> f32 %174 = llvm.insertelement %173, %1[%0 : i32] : vector<3xf32> %175 = llvm.shufflevector %174, %1 [0, 0, 0] : vector<3xf32> %176 = llvm.intr.fmuladd(%119, %175, %167) : (vector<3xf32>, vector<3xf32>, vector<3xf32>) -> vector<3xf32> %177 = llvm.insertvalue %176, %7[0] : !llvm.array<1 x vector<3xf32>> %178 = llvm.insertvalue %177, %9[0] : !llvm.array<1 x array<1 x vector<3xf32>>> %179 = llvm.add %77, %13 : i64 llvm.br ^bb4(%179, %178 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb6: // pred: ^bb4 %180 = llvm.add %74, %13 : i64 llvm.br ^bb3(%180, %78 : i64, !llvm.array<1 x array<1 x vector<3xf32>>>) ^bb7: // pred: ^bb3 %181 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<3xf32>>> %182 = llvm.mul %70, %3 : i64 %183 = llvm.mul %69, %4 : i64 %184 = llvm.add %182, %183 : i64 %185 = llvm.mul %67, %12 : i64 %186 = llvm.add %184, %185 : i64 %187 = llvm.add %186, %72 : i64 %188 = llvm.getelementptr %52[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %181, %188 {alignment = 4 : i64} : vector<3xf32>, !llvm.ptr %189 = llvm.add %72, %11 : i64 llvm.br ^bb2(%189 : i64) ^bb8: // pred: ^bb2 %190 = llvm.add %70, %13 : i64 llvm.br ^bb1(%190 : i64) ^bb9: // pred: ^bb1 llvm.return %0 : i32 } } } } // -----// IR Dump After SerializeTargetExecutablesPass (iree-hal-serialize-target-executables) //----- // hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } // -----// IR Dump After SerializeExecutablesPass (iree-hal-serialize-executables) //----- // hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058
// -----// IR Dump Before SCFToControlFlow (convert-scf-to-cf) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index
%__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] 
workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view }
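As a cross-check on the constants threaded through @main above: each dynamic dimension of %input0 is widened to i64 and split into low/high i32 halves by the arith.index_castui / arith.trunci / arith.shrui chains (four dims therefore become eight push constants), and the %c2400 / %c2016 byte lengths are just element counts times sizeof(f32) for the 4x6x5x5 weights and the 2x4x7x9 result of a stride-1, dilation-1 convolution. The arithmetic as a standalone Python sketch (illustrative, not an IREE API):

    # Shapes visible in the dump: weights 4x6x5x5, output 2x4x7x9; working
    # backwards through out = in - kernel + 1 gives the 2x6x11x13 input.
    N, C, H, W = 2, 6, 11, 13
    F, KH, KW = 4, 5, 5
    OH, OW = H - KH + 1, W - KW + 1              # 7, 9
    assert (OH, OW) == (7, 9)
    SIZEOF_F32 = 4
    assert F * C * KH * KW * SIZEOF_F32 == 2400  # %c2400: weights buffer bytes
    assert N * F * OH * OW * SIZEOF_F32 == 2016  # %c2016: transient output bytes

    def split_index(v: int) -> tuple[int, int]:
        # i64 -> (low i32, high i32), mirroring each trunci/shrui pair.
        return v & 0xFFFFFFFF, (v >> 32) & 0xFFFFFFFF

    assert split_index(13) == (13, 0)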
// -----// IR Dump Before PruneExecutablesPass (iree-hal-prune-executables) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb6 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3, %4 : index, index, !hal.device), ^bb7 ^bb2(%8: index, %9: index, %10: !hal.device): // pred: ^bb1
%device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb5(%value_1 : i1) ^bb4: // pred: ^bb2 cf.br ^bb5(%false : i1) ^bb5(%11: i1): // 2 preds: ^bb3, ^bb4 cf.br ^bb6 ^bb6: // pred: ^bb5 %12 = arith.cmpi eq, %9, %c0 : index %13 = arith.select %11, %c1, %c0 : index %14 = arith.addi %9, %13 : index %15 = arith.andi %11, %12 : i1 %16 = arith.select %15, %device_n, %1 : !hal.device %17 = arith.addi %8, %c1 : index cf.br ^bb1(%17, %14, %16 : index, index, !hal.device) ^bb7: // pred: ^bb1 %18 = util.cmp.eq %4, %1 : !hal.device cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb9 ^bb9: // 2 preds: ^bb7, ^bb8 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %19 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %20 = arith.cmpi eq, %19, %c0 : index cf.cond_br %20, ^bb10, ^bb11 ^bb10: // pred: ^bb9 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb12(%exe : !hal.executable) ^bb11: // pred: ^bb9 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb12(%0 : !hal.executable) ^bb12(%21: !hal.executable): // 2 preds: ^bb10, ^bb11 cf.br ^bb13 ^bb13: // pred: ^bb12 util.global.store %21, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
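The util.initializer in the module above is a lowered device-selection loop: it walks the hal.devices.count devices, takes the first one whose hal.device.id matches "local*" and which reports support for the embedded-elf-x86_64 executable format, then creates the descriptor set layout (two read-only storage buffers plus one writable), the 8-constant pipeline layout, and the executable against that device; the two util.status.check_ok calls are the device-not-found and unsupported-format failure paths. The same control flow re-expressed as a plain Python loop (the _Device stub and its query helper are hypothetical; the real logic is the cf.br/cf.cond_br blocks above):

    class _Device:
        # Stand-in for a !hal.device handle (illustrative only).
        def __init__(self, props):
            self.props = props

        def query(self, category, key):
            return self.props.get((category, key), False)

    def select_device(devices):
        # ^bb1..^bb6: scan until a device is selected or the list is exhausted.
        for dev in devices:
            if not dev.query("hal.device.id", "local*"):
                continue  # ^bb4: not a local device
            if dev.query("hal.executable.format", "embedded-elf-x86_64"):
                return dev  # first match wins; the IR also exits its loop here
        # ^bb8: util.status.check_ok %c5_i32 fires with this message.
        raise RuntimeError("HAL device `__device_0` not found or unavailable")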
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 =
arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb6 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3, %4 : index, index, !hal.device), ^bb7 ^bb2(%8: index, %9: index, %10: !hal.device): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb5(%value_1 : i1) ^bb4: // pred: ^bb2 cf.br ^bb5(%false : i1) ^bb5(%11: i1): // 2 preds: ^bb3, ^bb4 cf.br ^bb6 ^bb6: // pred: ^bb5 %12 = arith.cmpi eq, %9, %c0 : index %13 = arith.select %11, %c1, %c0 : index %14 = arith.addi %9, %13 : index %15 = arith.andi %11, %12 : i1 %16 = arith.select %15, %device_n, %1 : !hal.device %17 = arith.addi %8, %c1 : index cf.br ^bb1(%17, %14, %16 : index, index, !hal.device) ^bb7: // pred: ^bb1 %18 = util.cmp.eq %4, %1 : !hal.device cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb9 ^bb9: // 2 preds: ^bb7, ^bb8 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %19 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %20 = arith.cmpi eq, %19, %c0 : index cf.cond_br %20, ^bb10, ^bb11 ^bb10: // pred: ^bb9 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb12(%exe : !hal.executable) ^bb11: // pred: ^bb9 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb12(%0 : !hal.executable) ^bb12(%21: !hal.executable): // 2 preds: ^bb10, ^bb11 cf.br ^bb13 ^bb13: // pred: ^bb12 util.global.store %21, 
@__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415
C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C000000000000000030000000000
0000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After SymbolDCE (symbol-dce) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = 
util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb6 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3, %4 : index, index, !hal.device), ^bb7 ^bb2(%8: index, %9: index, %10: !hal.device): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb5(%value_1 : i1) ^bb4: // pred: ^bb2 cf.br ^bb5(%false : i1) ^bb5(%11: i1): // 2 preds: ^bb3, ^bb4 cf.br ^bb6 ^bb6: // pred: ^bb5 %12 = arith.cmpi eq, %9, %c0 : index %13 = arith.select %11, %c1, %c0 : index %14 = arith.addi %9, %13 : index %15 = arith.andi %11, %12 : i1 %16 = arith.select %15, %device_n, %1 : !hal.device %17 = arith.addi %8, %c1 : index cf.br ^bb1(%17, %14, %16 : index, index, !hal.device) ^bb7: // pred: ^bb1 %18 = util.cmp.eq %4, %1 : !hal.device cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb9 ^bb9: // 2 preds: ^bb7, ^bb8 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %19 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %20 = arith.cmpi eq, %19, %c0 : index cf.cond_br %20, ^bb10, ^bb11 ^bb10: // pred: ^bb9 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb12(%exe : !hal.executable) ^bb11: // pred: ^bb9 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: 
[embedded-elf-x86_64]" cf.br ^bb12(%0 : !hal.executable) ^bb12(%21: !hal.executable): // 2 preds: ^bb10, ^bb11 cf.br ^bb13 ^bb13: // pred: ^bb12 util.global.store %21, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983F
C050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006
000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : 
!hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private 
@__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb6 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3, %4 : index, index, !hal.device), ^bb7 ^bb2(%8: index, %9: index, %10: !hal.device): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb5(%value_1 : i1) ^bb4: // pred: ^bb2 cf.br ^bb5(%false : i1) ^bb5(%11: i1): // 2 preds: ^bb3, ^bb4 cf.br ^bb6 ^bb6: // pred: ^bb5 %12 = arith.cmpi eq, %9, %c0 : index %13 = arith.select %11, %c1, %c0 : index %14 = arith.addi %9, %13 : index %15 = arith.andi %11, %12 : i1 %16 = arith.select %15, %device_n, %1 : !hal.device %17 = arith.addi %8, %c1 : index cf.br ^bb1(%17, %14, %16 : index, index, !hal.device) ^bb7: // pred: ^bb1 %18 = util.cmp.eq %4, %1 : !hal.device cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb9 ^bb9: // 2 preds: ^bb7, ^bb8 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %19 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %20 = arith.cmpi eq, %19, %c0 : index cf.cond_br %20, ^bb10, ^bb11 ^bb10: // pred: ^bb9 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb12(%exe : !hal.executable) ^bb11: 
// pred: ^bb9 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb12(%0 : !hal.executable) ^bb12(%21: !hal.executable): // 2 preds: ^bb10, ^bb11 cf.br ^bb13 ^bb13: // pred: ^bb12 util.global.store %21, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6
C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C0000000000000000000000000000000800000000000000000000000000000033000000010000000600000000000
0004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 
: !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private 
@__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb6 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3, %4 : index, index, !hal.device), ^bb7 ^bb2(%8: index, %9: index, %10: !hal.device): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb5(%value_1 : i1) ^bb4: // pred: ^bb2 cf.br ^bb5(%false : i1) ^bb5(%11: i1): // 2 preds: ^bb3, ^bb4 cf.br ^bb6 ^bb6: // pred: ^bb5 %12 = arith.cmpi eq, %9, %c0 : index %13 = arith.select %11, %c1, %c0 : index %14 = arith.addi %9, %13 : index %15 = arith.andi %11, %12 : i1 %16 = arith.select %15, %device_n, %1 : !hal.device %17 = arith.addi %8, %c1 : index cf.br ^bb1(%17, %14, %16 : index, index, !hal.device) ^bb7: // pred: ^bb1 %18 = util.cmp.eq %4, %1 : !hal.device cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb9 ^bb9: // 2 preds: ^bb7, ^bb8 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %19 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %20 = arith.cmpi eq, %19, %c0 : index cf.cond_br %20, ^bb10, ^bb11 ^bb10: // pred: ^bb9 %exe = hal.executable.create device(%__device_0_4 : !hal.device) 
target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb12(%exe : !hal.executable) ^bb11: // pred: ^bb9 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb12(%0 : !hal.executable) ^bb12(%21: !hal.executable): // 2 preds: ^bb10, ^bb11 cf.br ^bb13 ^bb13: // pred: ^bb12 util.global.store %21, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242F
F3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000200100000000000001000000000000000800000000000000180000000000000021000000010000000200000000000000580300000000000058030000000000008000000000000000000000000000000008000000000000000000000000000000290000000100000002000
00000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : 
index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : 
!hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb6 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3, %4 : index, index, !hal.device), ^bb7 ^bb2(%8: index, %9: index, %10: !hal.device): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb5(%value_1 : i1) ^bb4: // pred: ^bb2 cf.br ^bb5(%false : i1) ^bb5(%11: i1): // 2 preds: ^bb3, ^bb4 cf.br ^bb6 ^bb6: // pred: ^bb5 %12 = arith.cmpi eq, %9, %c0 : index %13 = arith.select %11, %c1, %c0 : index %14 = arith.addi %9, %13 : index %15 = arith.andi %11, %12 : i1 %16 = arith.select %15, %device_n, %1 : !hal.device %17 = arith.addi %8, %c1 : index cf.br ^bb1(%17, %14, %16 : index, index, !hal.device) ^bb7: // pred: ^bb1 cf.cond_br %5, ^bb8, ^bb9 ^bb8: // pred: ^bb7 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb9 ^bb9: // 2 preds: ^bb7, ^bb8 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %18 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %19 = arith.cmpi eq, %18, %c0 : index cf.cond_br 
%19, ^bb10, ^bb11 ^bb10: // pred: ^bb9 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb12(%exe : !hal.executable) ^bb11: // pred: ^bb9 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb12(%0 : !hal.executable) ^bb12(%20: !hal.executable): // 2 preds: ^bb10, ^bb11 cf.br ^bb13 ^bb13: // pred: ^bb12 util.global.store %20, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D74
3D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000000000010020000000000001002000000000000230000000000000000000000000000000100000000000000000000000000000017000000040000000200000000000000380200000000000038020000000000002001000000000000010000000000000008000000000000001800000000000000210000000100000002000000000000005803000000000
0005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0")
shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) 
type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After Canonicalizer (canonicalize) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3 : index, index), ^bb5 ^bb2(%8: index, %9: index): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%10: i1): // 2 preds: ^bb2, ^bb3 %11 = arith.cmpi eq, %9, %c0 : index %12 = arith.select %10, %c1, %c0 : index %13 = arith.addi %9, %12 : index %14 = arith.andi %10, %11 : i1 %15 = arith.select %14, %device_n, %1 : !hal.device %16 = arith.addi %8, %c1 : index cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %17 =
arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %18 = arith.cmpi eq, %17, %c0 : index cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%19: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %19, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA844
98D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000200100
00000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before CSE (cse) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3 : index, index), ^bb5 ^bb2(%8: index, %9: index): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%10: i1): // 2 preds: ^bb2, ^bb3 %11 = arith.cmpi eq, %9, %c0 : index %12 = arith.select %10, %c1, %c0 : index %13 = arith.addi %9, %12 : index %14 = arith.andi %10, %11 : i1 %15 = arith.select %14, %device_n, %1 : !hal.device %16 = arith.addi %8, %c1 : index cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %17 = arith.select 
%__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %18 = arith.cmpi eq, %17, %c0 : index cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%19: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %19, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F242
0F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000000000010020000000000001002000000000000230000000000000000000000000000000100000000000000000000000000000017000000040000000200000000000000380200000000000038020000000000002001000000000000010
0000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = 
hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = 
hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3 : index, index), ^bb5 ^bb2(%8: index, %9: index): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%10: i1): // 2 preds: ^bb2, ^bb3 %11 = arith.cmpi eq, %9, %c0 : index %12 = arith.select %10, %c1, %c0 : index %13 = arith.addi %9, %12 : index %14 = arith.andi %10, %11 : i1 %15 = arith.select %14, %device_n, %1 : !hal.device %16 = arith.addi %8, %c1 : index cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %17 = arith.select 
%__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %18 = arith.cmpi eq, %17, %c0 : index cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%19: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %19, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F242
0F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000000000010020000000000001002000000000000230000000000000000000000000000000100000000000000000000000000000017000000040000000200000000000000380200000000000038020000000000002001000000000000010
0000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = 
hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = 
hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3 : index, index), ^bb5 ^bb2(%8: index, %9: index): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%10: i1): // 2 preds: ^bb2, ^bb3 %11 = arith.cmpi eq, %9, %c0 : index %12 = arith.select %10, %c1, %c0 : index %13 = arith.addi %9, %12 : index %14 = arith.andi %10, %11 : i1 %15 = arith.select %14, %device_n, %1 : !hal.device %16 = arith.addi %8, %c1 : index cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 util.global.store %4, @__device_0 : !hal.device %__device_0 = util.global.load @__device_0 : !hal.device %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_query_0_hal_executable_format_embedded_elf_x86_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 %__device_0_4 = util.global.load @__device_0 : !hal.device %descriptor_set_layout = hal.descriptor_set_layout.create device(%__device_0_4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%__device_0_4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %17 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_x86_64, %c0, %c-1 : index %18 = arith.cmpi eq, %17, %c0 : index cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%__device_0_4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable 
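// annotation (editorial, not compiler output): comparing this "Before" dump
// with the "After" dump that follows shows the effect of
// iree-util-simplify-global-accesses here: loads of a global that are
// dominated by a store to the same global in ^bb7 are replaced with the
// stored SSA values (%4 and %value_3), and the two util.global.store ops are
// batched together just before the cond_br. A minimal sketch of the
// store-to-load forwarding, using a hypothetical global @g:
//   util.global.store %v, @g : i1
//   %x = util.global.load @g : i1   // ==> users of %x are rewritten to use
//   ...                             //     %v; the store itself remains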
cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%19: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %19, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3 : index, index), ^bb5 ^bb2(%8: index, %9: index): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%10: i1): // 2 preds: ^bb2, ^bb3 %11 = arith.cmpi eq, %9, %c0 : index %12 = arith.select %10, %c1, %c0 : index %13 = arith.addi %9, %12 : index %14 = arith.andi %10, %11 : i1 %15 = arith.select %14, %device_n, %1 : !hal.device %16 = arith.addi %8, %c1 : index cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %17 = arith.select %value_3, %c0, %c-1 : index %18 = arith.cmpi eq, %17, %c0 : index util.global.store %4, @__device_0 : !hal.device util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL 
device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%19: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %19, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = 
arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : 
i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create 
buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2(%2, %3 : index, index), ^bb5 ^bb2(%8: index, %9: index): // pred: ^bb1 %device_n = hal.devices.get %8 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%10: i1): // 2 preds: ^bb2, ^bb3 %11 = arith.cmpi eq, %9, %c0 : index %12 = arith.select %10, %c1, %c0 : index %13 = arith.addi %9, %12 : index %14 = arith.andi %10, %11 : i1 %15 = arith.select %14, %device_n, %1 : !hal.device %16 = arith.addi %8, %c1 : index cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %17 = arith.select %value_3, %c0, %c-1 : index %18 = arith.cmpi eq, %17, %c0 : index util.global.store %4, @__device_0 : !hal.device util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 cf.cond_br %18, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 
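// annotation (editorial, not compiler output): the util.initializer CFG in
// this dump performs HAL device selection; ^bb1-^bb6 roughly implement the
// following (illustrative pseudocode only):
//   device = null; matches = 0; i = 0
//   while device == null and i < hal.devices.count:
//     d = hal.devices.get(i)
//     if d.query("hal.device.id", "local*") and
//        d.query("hal.executable.format", "embedded-elf-x86_64"):
//       if matches == 0: device = d
//       matches += 1
//     i += 1
//   if device == null: report "HAL device `__device_0` not found or unavailable"
// ^bb9 just below is the companion failure path, taken when the selected
// device does not support the embedded-elf-x86_64 executable format.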
util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%19: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %19, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450F
C6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000040140000000000004004000000000000410200000000000
00000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator =
hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : 
!hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout 
util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C30000000000000000000000000000000400000000000000000000000000000000000000000000000000000000000
00000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E80700000000000000000000000000000100000000000000
00000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5])
type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false 
= arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_x86_64 : i1 cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) 
^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type<f32> : i32 %dense_row_major = hal.encoding_type<dense_row_major> : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) 
^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
// -----// IR Dump After IPO (iree-util-ipo) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index} { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3:
index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
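// NOTE: the %5..%8 chain above folds the f32 byte width into the element-count
// product, so %8 = d0 * d1 * d2 * d3 * 4 bytes; that is the value checked as
// minimum_length for the dynamically shaped input0. The static operands use the
// same formula pre-folded into constants: the weights tensor<4x6x5x5xf32> need
// 4*6*5*5*4 = 2400 bytes (%c2400, checked on the next assert) and the result
// tensor<2x4x7x9xf32> needs 2*4*7*9*4 = 2016 bytes (%c2016). As a worked
// example, a batch-2, 6-channel 11x13 input (the shape a 5x5 NCHW conv would
// need to produce 2x4x7x9) resolves the dynamic check to 2*6*11*13*4 = 6864 bytes.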
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: 
!hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
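// NOTE: the %9..%24 sequence further below is the usual lo/hi split for passing
// `index`-typed dims through 32-bit push constants; per dimension the pattern
// (illustrative SSA names, not taken from this dump) is:
//   %d   = arith.index_castui %dim : index to i64
//   %lo  = arith.trunci %d : i64 to i32            // bits [0, 32)
//   %hiw = arith.shrui %d, %c32_i64 : i64
//   %hi  = arith.trunci %hiw : i64 to i32          // bits [32, 64)
// Four dynamic dims thus yield the eight i32 values handed to
// hal.command_buffer.push_constants.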
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before Inliner (inline) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = 
util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
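// NOTE: the tail of this function is a synchronous wrapper over the async HAL
// queue: util.null supplies a null (no-wait) fence for hal.device.queue.alloca,
// which signals %fence once the 2016-byte transient result buffer is live;
// queue.execute waits on %fence and signals %fence_1; hal.fence.await with
// timeout_millis(-1) then blocks indefinitely before the transient buffer is
// wrapped as the 2x4x7x9xf32 result buffer_view.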
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : 
!hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump After Canonicalizer (canonicalize) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> 
key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = 
hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence 
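// NOTE: the descriptor bindings above mirror the layout built in the
// initializer: slots 0 and 1 (input and weights, ReadOnly|Indirect) carry %8
// and %c2400 bytes, slot 2 (Indirect) is the 2016-byte transient output, and
// the dispatch runs main_dispatch_0 over a [7, 4, 1] workgroup grid, presumably
// tiling the 2x4x7x9 result; a full execution barrier then orders the dispatch
// before the command buffer is finalized.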
hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After Canonicalizer (canonicalize) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 
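// NOTE: this "After Canonicalizer" dump of @main appears byte-for-byte
// identical to the "Before" dump above; on this trace the pass found nothing
// to fold in the already-canonical ABI wrapper.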
%12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After Inliner (inline) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : 
!hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
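// ABI marshaling in @main: each input buffer_view is asserted against its declared shape and f32 element type, and its backing buffer against a minimum byte length: %8 = d0*d1*d2*d3*4 for the dynamic input0 (the %c4 factor is sizeof(f32)), and %c2400 = 4*6*5*5*4 for the static input1.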
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before SymbolDCE (symbol-dce) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 
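// Device-enumeration loop in the initializer: ^bb1 carries (device index, match count, selected device) and iterates until a device matches the "local*" id query and supports the "embedded-elf-x86_64" executable format, or %device_count is exhausted; the fallthrough to ^bb6 reports the not-found status.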
%5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
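// The 2016-byte transient buffer allocated below (2*4*7*9 = 504 f32 elements * 4 bytes) holds the conv output; hal.device.queue.alloca signals %fence so the allocation is ordered before the dispatch that writes it, and the buffer is finally wrapped in the returned !hal.buffer_view.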
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After SymbolDCE (symbol-dce) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 
= util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
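// Command recording: the one-shot command buffer receives the eight push constants, a descriptor set binding input0 and input1 read-only plus the transient output at slot 2, and a single dispatch over [7, 4, 1] workgroups; the queue then executes it waiting on %fence and signaling %fence_1, which the host awaits before building the result view.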
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before SCFForLoopCanonicalization (scf-for-loop-canonicalization) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = 
hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump After SCFForLoopCanonicalization (scf-for-loop-canonicalization) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = 
// -----// IR Dump Before LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump Before SCFToControlFlow (convert-scf-to-cf) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump Before AffineExpandIndexOps (affine-expand-index-ops) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump After AffineExpandIndexOps (affine-expand-index-ops) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump Before ConvertAffineToStandard (lower-affine) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump Before ArithUnsignedWhenEquivalent (arith-unsigned-when-equivalent) //----- //
// (util.initializer unchanged; identical to the preceding dump)
// -----// IR Dump After ArithUnsignedWhenEquivalent (arith-unsigned-when-equivalent) //----- //
// (util.initializer unchanged; identical to the preceding dump)
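From here on the trace repeats the same cleanup passes over the public @main wrapper, shown next. One detail worth noting before the dump: the submission in @main is fully asynchronous and ordered only by fences. hal.device.queue.alloca signals %fence once the transient output buffer is ready, hal.device.queue.execute waits on %fence and signals %fence_1, and the host finally blocks in hal.fence.await on %fence_1. A small self-contained Python sketch of that dependency order (illustrative only; the op names here are plain strings, not runtime calls):

from graphlib import TopologicalSorter

# Edges mirror the wait/signal fences in @main below.
deps = {
    "queue.alloca(%transient_buffer)": [],                         # wait(%0: null fence)
    "queue.execute([%cmd])": ["queue.alloca(%transient_buffer)"],  # wait(%fence)
    "fence.await(%fence_1)": ["queue.execute([%cmd])"],            # host-side wait
}
order = list(TopologicalSorter(deps).static_order())
print(order)  # alloca first, then execute, then the host wait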
// -----// IR Dump Before SCFForLoopCanonicalization (scf-for-loop-canonicalization) //----- //
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c6 = arith.constant 6 : index
  %c4 = arith.constant 4 : index
  %c5 = arith.constant 5 : index
  %c0 = arith.constant 0 : index
  %c2400 = arith.constant 2400 : index
  %c2016 = arith.constant 2016 : index
  %c32_i64 = arith.constant 32 : i64
  %c-1_i64 = arith.constant -1 : i64
  %0 = util.null : !hal.fence
  %c0_i64 = arith.constant 0 : i64
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c7 = arith.constant 7 : index
  %c-1_i32 = arith.constant -1 : i32
  %c9 = arith.constant 9 : index
  %__device_0 = util.global.load immutable @__device_0 : !hal.device
  %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout
  %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index
  %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index
  %element_type_f32 = hal.element_type<f32> : i32
  %dense_row_major = hal.encoding_type<dense_row_major> : i32
  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major)
  %5 = arith.muli %1, %c4 : index
  %6 = arith.muli %5, %2 : index
  %7 = arith.muli %6, %3 : index
  %8 = arith.muli %7, %4 : index
  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major)
  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016}
  %9 = arith.index_castui %1 : index to i64
  %10 = arith.trunci %9 : i64 to i32
  %11 = arith.shrui %9, %c32_i64 : i64
  %12 = arith.trunci %11 : i64 to i32
  %13 = arith.index_castui %2 : index to i64
  %14 = arith.trunci %13 : i64 to i32
  %15 = arith.shrui %13, %c32_i64 : i64
  %16 = arith.trunci %15 : i64 to i32
  %17 = arith.index_castui %3 : index to i64
  %18 = arith.trunci %17 : i64 to i32
  %19 = arith.shrui %17, %c32_i64 : i64
  %20 = arith.trunci %19 : i64 to i32
  %21 = arith.index_castui %4 : index to i64
  %22 = arith.trunci %21 : i64 to i32
  %23 = arith.shrui %21, %c32_i64 : i64
  %24 = arith.trunci %23 : i64 to i32
  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer
  hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32
  hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
    %c0 = (%buffer : !hal.buffer)[%c0, %8],
    %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400],
    %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016]
  ])
  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None")
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
  hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd])
  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32
  util.status.check_ok %status, "failed to wait on timepoint"
  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
  util.return %view : !hal.buffer_view
}
// -----// IR Dump After SCFForLoopCanonicalization (scf-for-loop-canonicalization) //----- //
// (@main unchanged; identical to the preceding dump)
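In the dump above, the %9 through %24 chain packs the four dynamic dimensions of %input0 into the eight i32 push constants fed to hal.command_buffer.push_constants: each index is zero-extended to i64, the low word is taken with arith.trunci, and the high word with arith.shrui by 32 followed by arith.trunci. The same arithmetic in Python, using this trace's 2x6x11x13 input as example values:

def split_index(dim):
    v = dim & 0xFFFFFFFFFFFFFFFF   # arith.index_castui ... : index to i64
    lo = v & 0xFFFFFFFF            # arith.trunci : i64 to i32
    hi = (v >> 32) & 0xFFFFFFFF    # arith.shrui ..., 32 then trunci
    return lo, hi

dims = (2, 6, 11, 13)
words = [w for d in dims for w in split_index(d)]
print(words)  # [2, 0, 6, 0, 11, 0, 13, 0] -> values([%10, %12, ..., %24])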
// -----// IR Dump Before LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
// (@main unchanged; identical to the preceding dump)
// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
// (@main unchanged; identical to the preceding dump)
// -----// IR Dump Before SCFToControlFlow (convert-scf-to-cf) //----- //
// (@main unchanged; identical to the preceding dump)
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
// (@main unchanged; identical to the preceding dump)
// -----// IR Dump Before AffineExpandIndexOps (affine-expand-index-ops) //----- //
// (@main unchanged; the captured log breaks off partway through this dump)
!hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : 
!hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After AffineExpandIndexOps (affine-expand-index-ops) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before ConvertAffineToStandard (lower-affine) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = 
util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, 
%c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : 
!hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before ArithUnsignedWhenEquivalent (arith-unsigned-when-equivalent) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable 
@__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : 
!hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After ArithUnsignedWhenEquivalent (arith-unsigned-when-equivalent) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before PropagateSubranges (iree-util-propagate-subranges) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: 
"embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016}
    %9 = arith.index_castui %1 : index to i64
    %10 = arith.trunci %9 : i64 to i32
    %11 = arith.shrui %9, %c32_i64 : i64
    %12 = arith.trunci %11 : i64 to i32
    %13 = arith.index_castui %2 : index to i64
    %14 = arith.trunci %13 : i64 to i32
    %15 = arith.shrui %13, %c32_i64 : i64
    %16 = arith.trunci %15 : i64 to i32
    %17 = arith.index_castui %3 : index to i64
    %18 = arith.trunci %17 : i64 to i32
    %19 = arith.shrui %17, %c32_i64 : i64
    %20 = arith.trunci %19 : i64 to i32
    %21 = arith.index_castui %4 : index to i64
    %22 = arith.trunci %21 : i64 to i32
    %23 = arith.shrui %21, %c32_i64 : i64
    %24 = arith.trunci %23 : i64 to i32
    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer
    hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
      %c0 = (%buffer : !hal.buffer)[%c0, %8],
      %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400],
      %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016]
    ])
    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None")
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
    hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd])
    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32
    util.status.check_ok %status, "failed to wait on timepoint"
    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
    util.return %view : !hal.buffer_view
  }
}
// -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) //----- //
module {
  util.global private @__device_0 : !hal.device
  util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout
  util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable
  util.initializer {
    %0 = util.null : !hal.executable
    %c14_i32 = arith.constant 14 : i32
    %c-1 = arith.constant -1 : index
    %c5_i32 = arith.constant 5 : i32
    %false = arith.constant false
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %1 = util.null : !hal.device
    %device_count = hal.devices.count : index
    cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
  ^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
    %5 = util.cmp.eq %4, %1 : !hal.device
    %6 = arith.cmpi slt, %2, %device_count : index
    %7 = arith.andi %5, %6 : i1
    cf.cond_br %7, ^bb2, ^bb5
  ^bb2:  // pred: ^bb1
    %device_n = hal.devices.get %2 : !hal.device
    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
  ^bb3:  // pred: ^bb2
    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
    cf.br ^bb4(%value_1 : i1)
  ^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
    %9 = arith.cmpi eq, %3, %c0 : index
    %10 = arith.select %8, %c1, %c0 : index
    %11 = arith.addi %3, %10 : index
    %12 = arith.andi %8, %9 : i1
    %13 = arith.select %12, %device_n, %1 : !hal.device
    %14 = arith.addi %2, %c1 : index
    cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
  ^bb5:  // pred: ^bb1
    cf.cond_br %5, ^bb6, ^bb7
  ^bb6:  // pred: ^bb5
    util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
    cf.br ^bb7
  ^bb7:  // 2 preds: ^bb5, ^bb6
    %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
    %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout
    %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout
    %15 = arith.select %value_3, %c0, %c-1 : index
    %16 = arith.cmpi eq, %15, %c0 : index
    util.global.store %4, @__device_0 : !hal.device
    cf.cond_br %16, ^bb8, ^bb9
  ^bb8:  // pred: ^bb7
    %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable
    cf.br ^bb10(%exe : !hal.executable)
  ^bb9:  // pred: ^bb7
    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
    cf.br ^bb10(%0 : !hal.executable)
  ^bb10(%17: !hal.executable):  // 2 preds: ^bb8, ^bb9
    util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable
    util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout
    util.return
  }
  hal.executable private @main_dispatch_0 {
    hal.executable.binary public @embedded_elf_x86_64 attributes {data =
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } }
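// Note on the @main body above: each dynamic dimension of %input0 is an
// `index` that is widened to i64 and split into low/high i32 halves
// (arith.index_castui / arith.trunci / arith.shrui %c32_i64) so it can be
// recorded as 32-bit push constants. A minimal C sketch of that packing,
// using a hypothetical helper name (this is not IREE runtime API):
#include <stdint.h>

static void pack_index_as_push_constants(uint64_t dim, uint32_t out[2]) {
  out[0] = (uint32_t)dim;          /* low word,  e.g. %10 = trunci %9          */
  out[1] = (uint32_t)(dim >> 32);  /* high word, e.g. %12 = trunci(%9 >> 32)   */
}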
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// (module byte-identical to the "IR Dump After PropagateSubranges" above; duplicate body elided)
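// The util.initializer in these dumps is a first-match device scan: it walks
// hal.devices.count devices, keeps the first one whose "hal.device.id"
// matches "local*" and that also reports support for "embedded-elf-x86_64",
// and fails with the %c5_i32 status if the loop exits with no match. A
// C-style sketch of the same control flow (the query helpers are
// hypothetical stand-ins for hal.device.query, not real API):
#include <stdbool.h>
#include <stddef.h>

bool device_id_matches(size_t i, const char *pattern);      /* "local*" query */
bool device_supports_format(size_t i, const char *format);  /* ELF query      */

static ptrdiff_t find_local_elf_device(size_t device_count) {
  for (size_t i = 0; i < device_count; ++i) {
    /* ^bb2/^bb3: both queries must succeed; ^bb4 keeps only the first hit. */
    if (device_id_matches(i, "local*") &&
        device_supports_format(i, "embedded-elf-x86_64")) {
      return (ptrdiff_t)i;
    }
  }
  return -1;  /* ^bb6: util.status.check_ok %c5_i32, "... not found ..." */
}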
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (canonicalize made no changes; duplicate module body elided)
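// The minimum_length values asserted on the buffers in @main are byte counts
// for row-major f32 data: the dynamic input checks %8 = d0*4*d1*d2*d3 (the
// 4 bytes/element folded into the product), while the constants are
// 2400 = 4*6*5*5*4 for the weights and 2016 = 2*4*7*9*4 for the transient
// output. A quick self-contained check in C:
#include <assert.h>
#include <stdint.h>

static uint64_t f32_bytes(uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3) {
  return d0 * d1 * d2 * d3 * 4;  /* 4 bytes per f32 element */
}

int main(void) {
  assert(f32_bytes(4, 6, 5, 5) == 2400);  /* %c2400: 4x6x5x5 weights */
  assert(f32_bytes(2, 4, 7, 9) == 2016);  /* %c2016: 2x4x7x9 output  */
  return 0;
}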
// -----// IR Dump Before CSE (cse) //----- //
// (module still identical to the dump above; duplicate body elided)
util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
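// NOTE (editorial): %8 above is the byte length of the dynamic input0: the four buffer_view dims are multiplied together along with %c4 (sizeof f32), so the hal.buffer.assert on %buffer checks minimum_length(%8), e.g. 2*6*11*13*4 = 6864 bytes for a 2x6x11x13xf32 argument. The assert on %buffer_0 that follows instead uses the precomputed constant %c2400 = 4*6*5*5*4 bytes for the static weights.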
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After CSE (cse) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = 
util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
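// NOTE (editorial): the hal.device.queue.alloca just below allocates the 2016-byte transient buffer that holds the tensor<2x4x7x9xf32> result (2*4*7*9*4 bytes); it waits on the null fence %0 (no dependency) and signals %fence, which the later hal.device.queue.execute in turn waits on before the dispatch runs.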
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = 
hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = 
hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } // -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable 
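// NOTE (editorial): in this pre-pass dump the three immutable util.global.load ops sit after the constant definitions; in the "After" dump further down, iree-util-simplify-global-accesses has hoisted them to the top of @main, which appears to be the only change the pass makes to this function.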
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : 
!hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 
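// NOTE (editorial): the index_castui/trunci/shrui sequence starting here splits each dynamic dim into a pair of i32 values: arith.trunci keeps the low 32 bits, and arith.shrui by %c32_i64 followed by trunci yields the high 32 bits, producing the eight words handed to hal.command_buffer.push_constants, presumably so the kernel side can reconstruct the 64-bit sizes.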
%10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } // -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = 
arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %c6 = arith.constant 6 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index %c0 = arith.constant 0 : index %c2400 = arith.constant 2400 : index %c2016 = arith.constant 2016 : index %c32_i64 = arith.constant 32 : i64 %c-1_i64 = arith.constant -1 : i64 %0 = util.null : !hal.fence %c0_i64 = arith.constant 0 : i64 %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c7 = arith.constant 7 : index %c-1_i32 = arith.constant -1 : i32 %c9 = arith.constant 9 : index %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
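// NOTE (editorial, speculative): the hal.command_buffer.dispatch later in this function launches a 7x4x1 workgroup grid (%c7, %c4, %c1); this plausibly tiles the H=7 and F=4 dimensions of the 2x4x7x9xf32 result, with the N and W dimensions handled inside each workgroup, though the actual tiling is decided by the codegen backend.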
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 
preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
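// NOTE: in this "After ApplyPatterns" dump the function appears to differ from the one above only in canonicalized constant ordering; the remainder below records the same one-shot command buffer: binding 0 (%buffer, dynamic size %8), binding 1 (%buffer_0, 2400 bytes), binding 2 (%transient_buffer, 2016 bytes), dispatched over workgroups [7, 4, 1].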
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
// (module elided: byte-for-byte identical to the "IR Dump After ApplyPatterns" module above, including the embedded ELF blob)
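For reference, the constant sizes recurring in these dumps follow directly from the conv shapes. A minimal Python sketch (illustrative only; pack_dims and the variable names are not IREE APIs):

    # Filter tensor<4x6x5x5xf32>: 4*6*5*5 elements * 4 bytes = 2400 (%c2400).
    filter_bytes = 4 * 6 * 5 * 5 * 4
    # Output tensor<2x4x7x9xf32>: for a stride-1, no-dilation 5x5 conv,
    # 7 = 11-5+1 and 9 = 13-5+1; 2*4*7*9 elements * 4 bytes = 2016 (%c2016,
    # the size of the transient buffer from hal.device.queue.alloca).
    output_bytes = 2 * 4 * 7 * 9 * 4

    def pack_dims(dims):
        """Split each 64-bit dim into (lo32, hi32) words, mirroring the
        arith.trunci / arith.shrui %c32_i64 pairs in the dumped @main."""
        words = []
        for d in dims:
            words.append(d & 0xFFFFFFFF)           # arith.trunci
            words.append((d >> 32) & 0xFFFFFFFF)   # arith.shrui + trunci
        return words

    assert filter_bytes == 2400 and output_bytes == 2016
    # The 2x6x11x13 input dims become the 8 push constants [2,0,6,0,11,0,13,0].
    assert pack_dims([2, 6, 11, 13]) == [2, 0, 6, 0, 11, 0, 13, 0]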
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (module elided: byte-for-byte identical to the "IR Dump After ApplyPatterns" module above, including the embedded ELF blob)
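The util.initializer repeated in each of these modules is a first-match device scan. A hedged Python sketch of its control flow (^bb1..^bb5); the runtime object and its devices_count/devices_get/query methods are hypothetical stand-ins for the hal.devices.count, hal.devices.get, and hal.device.query ops, not real IREE APIs:

    def select_device(runtime):
        # Walk devices in order and stop at the first one whose id matches
        # "local*" and which supports the "embedded-elf-x86_64" format.
        for i in range(runtime.devices_count()):       # hal.devices.count
            candidate = runtime.devices_get(i)         # hal.devices.get
            if (candidate.query("hal.device.id", "local*")
                    and candidate.query("hal.executable.format",
                                        "embedded-elf-x86_64")):
                return candidate                       # stored to @__device_0
        # In the IR this path is util.status.check_ok %c5_i32, "...".
        raise RuntimeError("HAL device `__device_0` not found or unavailable")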
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2
preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: 
^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump Before mlir::iree_compiler::IREE::VM::ConversionPass (iree-vm-conversion) //----- // module { util.global private @__device_0 : !hal.device util.global private @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.global private @__device_0_executable_0_main_dispatch_0 : !hal.executable util.initializer { %0 = util.null : !hal.executable %c14_i32 = arith.constant 14 : i32 %c-1 = arith.constant -1 : index %c5_i32 = arith.constant 5 : i32 %false = arith.constant false %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %1 = util.null : !hal.device %device_count = hal.devices.count : index cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) ^bb1(%2: index, %3: 
index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 %5 = util.cmp.eq %4, %1 : !hal.device %6 = arith.cmpi slt, %2, %device_count : index %7 = arith.andi %5, %6 : i1 cf.cond_br %7, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %device_n = hal.devices.get %2 : !hal.device %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false cf.cond_br %value, ^bb3, ^bb4(%false : i1) ^bb3: // pred: ^bb2 %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false cf.br ^bb4(%value_1 : i1) ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 %9 = arith.cmpi eq, %3, %c0 : index %10 = arith.select %8, %c1, %c0 : index %11 = arith.addi %3, %10 : index %12 = arith.andi %8, %9 : i1 %13 = arith.select %12, %device_n, %1 : !hal.device %14 = arith.addi %2, %c1 : index cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) ^bb5: // pred: ^bb1 cf.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 util.status.check_ok %c5_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" cf.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false %descriptor_set_layout = hal.descriptor_set_layout.create device(%4 : !hal.device) flags(Indirect) bindings([#hal.descriptor_set.binding<0, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<1, storage_buffer, "ReadOnly|Indirect">, #hal.descriptor_set.binding<2, storage_buffer, Indirect>]) : !hal.descriptor_set_layout %pipeline_layout = hal.pipeline_layout.create device(%4 : !hal.device) push_constants(8) layouts([%descriptor_set_layout]) : !hal.pipeline_layout %15 = arith.select %value_3, %c0, %c-1 : index %16 = arith.cmpi eq, %15, %c0 : index util.global.store %4, @__device_0 : !hal.device cf.cond_br %16, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %exe = hal.executable.create device(%4 : !hal.device) target(@main_dispatch_0::@embedded_elf_x86_64) layouts([%pipeline_layout]) : !hal.executable cf.br ^bb10(%exe : !hal.executable) ^bb9: // pred: ^bb7 util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" cf.br ^bb10(%0 : !hal.executable) ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 util.global.store %17, @__device_0_executable_0_main_dispatch_0 : !hal.executable util.global.store %pipeline_layout, @__device_0_pipeline_layout_0 : !hal.pipeline_layout util.return } hal.executable private @main_dispatch_0 { hal.executable.binary public @embedded_elf_x86_64 attributes {data = 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} } util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = arith.constant 9 : index %c-1_i32 = arith.constant -1 : i32 %c7 = arith.constant 7 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0_i64 = arith.constant 0 : i64 %0 = util.null : !hal.fence %c-1_i64 = arith.constant -1 : i64 %c32_i64 = arith.constant 32 : i64 %c2016 = arith.constant 2016 : index %c2400 = arith.constant 2400 : index %c0 = arith.constant 0 : index %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index %c6 = arith.constant 6 : index %__device_0 = util.global.load immutable @__device_0 : !hal.device %__device_0_pipeline_layout_0 = util.global.load immutable @__device_0_pipeline_layout_0 : !hal.pipeline_layout %__device_0_executable_0_main_dispatch_0 = util.global.load immutable @__device_0_executable_0_main_dispatch_0 : !hal.executable %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %2 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index %3 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[2] : index %4 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[3] : index %element_type_f32 = hal.element_type : i32 %dense_row_major = hal.encoding_type : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%1, %2, %3, %4]) type(%element_type_f32) encoding(%dense_row_major) %5 = arith.muli %1, %c4 : index %6 = arith.muli %5, %2 : index %7 = arith.muli %6, %3 : index %8 = arith.muli %7, %4 : index %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4, %c6, %c5, %c5]) type(%element_type_f32) encoding(%dense_row_major) %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer 
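// The static 2x4x7x9 result shape used below follows from the standard
// convolution output-extent formula with unit stride and dilation and no
// padding: 11 - 5 + 1 = 7 and 13 - 5 + 1 = 9 for the spatial dims, with
// batch 2 and filter count 4 carried through. A small C sketch of the
// general formula (an illustrative helper, not taken from the IR):
//
//   /* Output extent of a convolution along one dimension. */
//   static int conv_out_extent(int in, int kernel, int stride, int dilation) {
//     int effective_kernel = dilation * (kernel - 1) + 1;
//     return (in - effective_kernel) / stride + 1;
//   }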
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c2400) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c2016} %9 = arith.index_castui %1 : index to i64 %10 = arith.trunci %9 : i64 to i32 %11 = arith.shrui %9, %c32_i64 : i64 %12 = arith.trunci %11 : i64 to i32 %13 = arith.index_castui %2 : index to i64 %14 = arith.trunci %13 : i64 to i32 %15 = arith.shrui %13, %c32_i64 : i64 %16 = arith.trunci %15 : i64 to i32 %17 = arith.index_castui %3 : index to i64 %18 = arith.trunci %17 : i64 to i32 %19 = arith.shrui %17, %c32_i64 : i64 %20 = arith.trunci %19 : i64 to i32 %21 = arith.index_castui %4 : index to i64 %22 = arith.trunci %21 : i64 to i32 %23 = arith.shrui %21, %c32_i64 : i64 %24 = arith.trunci %23 : i64 to i32 %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") affinity(%c-1_i64) : !hal.command_buffer hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout) offset(0) values([%10, %12, %14, %16, %18, %20, %22, %24]) : i32, i32, i32, i32, i32, i32, i32, i32 hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%__device_0_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ %c0 = (%buffer : !hal.buffer)[%c0, %8], %c1 = (%buffer_0 : !hal.buffer)[%c0, %c2400], %c2 = (%transient_buffer : !hal.buffer)[%c0, %c2016] ]) hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_main_dispatch_0 : !hal.executable)[%c0] workgroups([%c7, %c4, %c1]) flags("None") hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") hal.command_buffer.finalize<%cmd : !hal.command_buffer> %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence hal.device.queue.execute<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands([%cmd]) %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) : i32 util.status.check_ok %status, "failed to wait on timepoint" %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c2016] shape([%c2, %c4, %c7, %c9]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view util.return %view : !hal.buffer_view } } // -----// IR Dump After mlir::iree_compiler::IREE::VM::ConversionPass (iree-vm-conversion) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.initializer { %null = vm.const.ref.zero : !vm.ref %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1 = vm.const.i64 1 %null_1 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, 
%null_1 : i64, i64, !vm.ref<!hal.device>)
^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
%req = vm.cmp.eq.ref %4, %null_1 : !vm.ref<!hal.device>
%slt = vm.cmp.lt.i64.s %2, %1 : i64
%5 = vm.and.i32 %req, %slt : i32
vm.cond_br %5, ^bb2, ^bb5
^bb2: // pred: ^bb1
%6 = vm.trunc.i64.i32 %2 : i64 -> i32
%ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
%buffer = vm.rodata.inline "_utf8_hal_device_id_D0F1B3E9D63E707C" {alignment = 1 : i64} : !vm.buffer = "hal.device.id"
%buffer_2 = vm.rodata.inline "_utf8_local_8DC315A014BAFA34" {alignment = 1 : i64} : !vm.buffer = "local*"
%7:2 = vm.call @hal.device.query.i64(%ref, %buffer, %buffer_2) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
%nz = vm.cmp.nz.i64 %7#1 : i64
%zero_3 = vm.const.i32.zero
%8 = vm.select.i32 %7#0, %nz, %zero_3 : i32
%c1_4 = vm.const.i32 1
vm.cond_br %8, ^bb3, ^bb4(%zero : i32)
^bb3: // pred: ^bb2
%buffer_5 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
%buffer_6 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64"
%9:2 = vm.call @hal.device.query.i64(%ref, %buffer_5, %buffer_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
%nz_7 = vm.cmp.nz.i64 %9#1 : i64
%zero_8 = vm.const.i32.zero
%10 = vm.select.i32 %9#0, %nz_7, %zero_8 : i32
%c1_9 = vm.const.i32 1
vm.br ^bb4(%10 : i32)
^bb4(%11: i32): // 2 preds: ^bb2, ^bb3
%eq = vm.cmp.eq.i64 %3, %zero_0 : i64
%12 = vm.select.i64 %11, %c1, %zero_0 : i64
%13 = vm.add.i64 %3, %12 : i64
%14 = vm.and.i32 %11, %eq : i32
%ref_10 = vm.select.ref %14, %ref, %null_1 : !vm.ref<!hal.device>
%15 = vm.add.i64 %2, %c1 : i64
vm.br ^bb1(%15, %13, %ref_10 : i64, i64, !vm.ref<!hal.device>)
^bb5: // pred: ^bb1
vm.cond_br %req, ^bb6, ^bb7
^bb6: // pred: ^bb5
vm.cond_fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
vm.br ^bb7
^bb7: // 2 preds: ^bb5, ^bb6
%buffer_11 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
%buffer_12 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64"
%16:2 = vm.call @hal.device.query.i64(%4, %buffer_11, %buffer_12) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
%nz_13 = vm.cmp.nz.i64 %16#1 : i64
%zero_14 = vm.const.i32.zero
%17 = vm.select.i32 %16#0, %nz_13, %zero_14 : i32
%c1_15 = vm.const.i32 1
%c1_16 = vm.const.i32 1
%zero_17 = vm.const.i32.zero
%c7 = vm.const.i32 7
%c3 = vm.const.i32 3
%c1_18 = vm.const.i32 1
%c7_19 = vm.const.i32 7
%c3_20 = vm.const.i32 3
%c2 = vm.const.i32 2
%c7_21 = vm.const.i32 7
%c2_22 = vm.const.i32 2
%ref_23 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1_16, [(%zero_17, %c7, %c3), (%c1_18, %c7_19, %c3_20), (%c2, %c7_21, %c2_22)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
%c8 = vm.const.i32 8
%ref_24 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_23]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...)
-> !vm.ref %18 = vm.select.i64 %17, %zero_0, %c-1 : i64 %eq_25 = vm.cmp.eq.i64 %18, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_25, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %buffer_26 = vm.rodata.inline "main_dispatch_0_embedded_elf_x86_64" {alignment = 16 : i64, mime_type = "application/x-elf"} : !vm.buffer = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB0
64989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C80000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000
58270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> %buffer_27 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64" %null_28 = vm.const.ref.zero : !vm.buffer %ref_29 = vm.call.variadic @hal.executable.create(%4, %buffer_27, %buffer_26, %null_28, [%ref_24]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref<!hal.executable>
    vm.br ^bb10(%ref_29 : !vm.ref<!hal.executable>)
  ^bb9:  // pred: ^bb7
    vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
    vm.br ^bb10(%null : !vm.ref<!hal.executable>)
  ^bb10(%19: !vm.ref<!hal.executable>):  // 2 preds: ^bb8, ^bb9
    vm.global.store.ref %19, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_24, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  }
  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
  vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7 = vm.const.i64 7
    %c2 = vm.const.i64 2
    %c1 = vm.const.i64 1
    %zero = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_0 = vm.const.i64 -1
    %c32 = vm.const.i64 32
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %zero_1 = vm.const.i64.zero
    %c5 = vm.const.i64 5
    %c4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %zero_2 = vm.const.i32.zero
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero_2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c1_3 = vm.const.i32 1
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1_3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c2_4 = vm.const.i32 2
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2_4) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c3 = vm.const.i32 3
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c553648160 = vm.const.i32 553648160
    %c1_5 = vm.const.i32 1
    %buffer = vm.rodata.inline "_utf8_input0_DA9A70D360954439" {alignment = 1 : i64} : !vm.buffer = "input0"
    vm.call.variadic @hal.buffer_view.assert(%arg0, %buffer, %c553648160, %c1_5, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_6 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %buffer_7 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor"
    %c16 = vm.const.i32 16
    %c3075 = vm.const.i32 3075
    vm.call @hal.buffer.assert(%ref, %buffer_7, %ref_6, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %buffer_8 = vm.rodata.inline "_utf8_input1_FDCC539DA203DDD3" {alignment = 1 : i64} : !vm.buffer = "input1"
    vm.call.variadic @hal.buffer_view.assert(%arg1, %buffer_8, %c553648160, %c1_5, [%c4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_9 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %buffer_10 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor"
    %c16_11 = vm.const.i32 16
    %c3075_12 = vm.const.i32 3075
    vm.call @hal.buffer.assert(%ref_9, %buffer_10, %ref_6, %c2400, %c16_11, %c3075_12) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %zero_13 = vm.const.i32.zero
    %ref_14 = vm.call @hal.fence.create(%__device_0, %zero_13) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %zero_15 = vm.const.i32.zero
    %c48 = vm.const.i32 48
    %c3075_16 = vm.const.i32 3075
    %ref_17 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_0, %null, %ref_14, %zero_15, %c48, %c3075_16, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %c32_18 = vm.const.i32 32
    %9 = vm.shr.i64.u %0, %c32_18 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %c32_19 = vm.const.i32 32
    %12 = vm.shr.i64.u %1, %c32_19 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %c32_20 = vm.const.i32 32
    %15 = vm.shr.i64.u %2, %c32_20 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %c32_21 = vm.const.i32 32
    %18 = vm.shr.i64.u %3, %c32_21 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %c1_22 = vm.const.i32 1
    %c3_23 = vm.const.i32 3
    %zero_24 = vm.const.i32.zero
    %ref_25 = vm.call @hal.command_buffer.create(%__device_0, %c1_22, %c3_23, %c-1_0, %zero_24) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    %zero_26 = vm.const.i32.zero
    vm.call.variadic @hal.command_buffer.push_constants(%ref_25, %__device_0_pipeline_layout_0, %zero_26, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    %zero_27 = vm.const.i32.zero
    %zero_28 = vm.const.i32.zero
    %zero_29 = vm.const.i32.zero
    %c1_30 = vm.const.i32 1
    %zero_31 = vm.const.i32.zero
    %c2_32 = vm.const.i32 2
    %zero_33 = vm.const.i32.zero
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_25, %__device_0_pipeline_layout_0, %zero_27, [(%zero_28, %zero_29, %ref, %zero_1, %7), (%c1_30, %zero_31, %ref_9, %zero_1, %c2400), (%c2_32, %zero_33, %ref_17, %zero_1, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    %zero_34 = vm.const.i32.zero
    %c7_35 = vm.const.i32 7
    %c4_36 = vm.const.i32 4
    %c1_37 = vm.const.i32 1
    %zero_38 = vm.const.i64.zero
    vm.call @hal.command_buffer.dispatch(%ref_25, %__device_0_executable_0_main_dispatch_0, %zero_34, %c7_35, %c4_36, %c1_37, %zero_38) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    %c28 = vm.const.i32 28
    %c13 = vm.const.i32 13
    %zero_39 = vm.const.i32.zero
    vm.call @hal.command_buffer.execution_barrier(%ref_25, %c28, %c13, %zero_39) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_25) : (!vm.ref<!hal.command_buffer>) -> ()
    %zero_40 = vm.const.i32.zero
    %ref_41 = vm.call @hal.fence.create(%__device_0, %zero_40) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_0, %ref_14, %ref_41, [%ref_25]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_41]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_fail %20, "failed to wait on timepoint"
    %ref_42 = vm.call.variadic @hal.buffer_view.create(%ref_17, %zero_1, %c2016, %c553648160, %c1_5, [%c2, %c4, %c7, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_42 : !vm.ref<!hal.buffer_view>
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
}
}
// -----// IR Dump Before mlir::iree_compiler::IREE::VM::ReifyRodataTablesPass (iree-vm-reify-rodata-tables) //----- //
vm.module public @module {
  vm.global.ref private @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.initializer {
    %null = vm.const.ref.zero : !vm.ref<!hal.executable>
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1 = vm.const.i64 1
    %null_1 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_1 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
    %req = vm.cmp.eq.ref %4, %null_1 : !vm.ref<!hal.device>
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %5 = vm.and.i32 %req, %slt : i32
    vm.cond_br %5, ^bb2, ^bb5
  ^bb2:  // pred: ^bb1
    %6 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %buffer = vm.rodata.inline "_utf8_hal_device_id_D0F1B3E9D63E707C" {alignment = 1 : i64} : !vm.buffer = "hal.device.id"
    %buffer_2 = vm.rodata.inline "_utf8_local_8DC315A014BAFA34" {alignment = 1 : i64} : !vm.buffer = "local*"
    %7:2 = vm.call @hal.device.query.i64(%ref, %buffer, %buffer_2) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %7#1 : i64
    %zero_3 = vm.const.i32.zero
    %8 = vm.select.i32 %7#0, %nz, %zero_3 : i32
    %c1_4 = vm.const.i32 1
    vm.cond_br %8, ^bb3, ^bb4(%zero : i32)
  ^bb3:  // pred: ^bb2
    %buffer_5 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
    %buffer_6 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64"
    %9:2 = vm.call @hal.device.query.i64(%ref, %buffer_5, %buffer_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %9#1 : i64
    %zero_8 = vm.const.i32.zero
    %10 = vm.select.i32 %9#0, %nz_7, %zero_8 : i32
    %c1_9 = vm.const.i32 1
    vm.br ^bb4(%10 : i32)
  ^bb4(%11: i32):  // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %12 = vm.select.i64 %11, %c1, %zero_0 : i64
    %13 = vm.add.i64 %3, %12 : i64
    %14 = vm.and.i32 %11, %eq : i32
    %ref_10 = vm.select.ref %14, %ref, %null_1 : !vm.ref<!hal.device>
    %15 = vm.add.i64 %2, %c1 : i64
    vm.br ^bb1(%15, %13, %ref_10 : i64, i64, !vm.ref<!hal.device>)
  ^bb5:  // pred: ^bb1
    vm.cond_br %req, ^bb6, ^bb7
  ^bb6:  // pred: ^bb5
    vm.cond_fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]>"
    vm.br ^bb7
  ^bb7:  // 2 preds: ^bb5, ^bb6
    %buffer_11 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
    %buffer_12 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64"
    %16:2 = vm.call @hal.device.query.i64(%4, %buffer_11, %buffer_12) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer,
!vm.buffer) -> (i32, i64) %nz_13 = vm.cmp.nz.i64 %16#1 : i64 %zero_14 = vm.const.i32.zero %17 = vm.select.i32 %16#0, %nz_13, %zero_14 : i32 %c1_15 = vm.const.i32 1 %c1_16 = vm.const.i32 1 %zero_17 = vm.const.i32.zero %c7 = vm.const.i32 7 %c3 = vm.const.i32 3 %c1_18 = vm.const.i32 1 %c7_19 = vm.const.i32 7 %c3_20 = vm.const.i32 3 %c2 = vm.const.i32 2 %c7_21 = vm.const.i32 7 %c2_22 = vm.const.i32 2 %ref_23 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1_16, [(%zero_17, %c7, %c3), (%c1_18, %c7_19, %c3_20), (%c2, %c7_21, %c2_22)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %c8 = vm.const.i32 8 %ref_24 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_23]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %18 = vm.select.i64 %17, %zero_0, %c-1 : i64 %eq_25 = vm.cmp.eq.i64 %18, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_25, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %buffer_26 = vm.rodata.inline "main_dispatch_0_embedded_elf_x86_64" {alignment = 16 : i64, mime_type = "application/x-elf"} : !vm.buffer = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666
666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000
00000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> %buffer_27 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64" %null_28 = vm.const.ref.zero : !vm.buffer %ref_29 = vm.call.variadic @hal.executable.create(%4, %buffer_27, %buffer_26, %null_28, [%ref_24]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref<!hal.executable>
    vm.br ^bb10(%ref_29 : !vm.ref<!hal.executable>)
  ^bb9:  // pred: ^bb7
    vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
    vm.br ^bb10(%null : !vm.ref<!hal.executable>)
  ^bb10(%19: !vm.ref<!hal.executable>):  // 2 preds: ^bb8, ^bb9
    vm.global.store.ref %19, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_24, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  }
  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
  vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7 = vm.const.i64 7
    %c2 = vm.const.i64 2
    %c1 = vm.const.i64 1
    %zero = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_0 = vm.const.i64 -1
    %c32 = vm.const.i64 32
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %zero_1 = vm.const.i64.zero
    %c5 = vm.const.i64 5
    %c4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %zero_2 = vm.const.i32.zero
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero_2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c1_3 = vm.const.i32 1
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1_3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c2_4 = vm.const.i32 2
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2_4) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c3 = vm.const.i32 3
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %c553648160 = vm.const.i32 553648160
    %c1_5 = vm.const.i32 1
    %buffer = vm.rodata.inline "_utf8_input0_DA9A70D360954439" {alignment = 1 : i64} : !vm.buffer = "input0"
    vm.call.variadic @hal.buffer_view.assert(%arg0, %buffer, %c553648160, %c1_5, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_6 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %buffer_7 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor"
    %c16 = vm.const.i32 16
    %c3075 = vm.const.i32 3075
    vm.call @hal.buffer.assert(%ref, %buffer_7, %ref_6, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %buffer_8 = vm.rodata.inline "_utf8_input1_FDCC539DA203DDD3" {alignment = 1 : i64} : !vm.buffer = "input1"
    vm.call.variadic @hal.buffer_view.assert(%arg1, %buffer_8, %c553648160, %c1_5, [%c4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_9 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %buffer_10 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor"
    %c16_11 = vm.const.i32 16
    %c3075_12 = vm.const.i32 3075
    vm.call @hal.buffer.assert(%ref_9, %buffer_10, %ref_6, %c2400, %c16_11, %c3075_12) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %zero_13 = vm.const.i32.zero
    %ref_14 = vm.call @hal.fence.create(%__device_0, %zero_13) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %zero_15 = vm.const.i32.zero
    %c48 = vm.const.i32 48
    %c3075_16 = vm.const.i32 3075
    %ref_17 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_0, %null, %ref_14, %zero_15, %c48, %c3075_16, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %c32_18 = vm.const.i32 32
    %9 = vm.shr.i64.u %0, %c32_18 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %c32_19 = vm.const.i32 32
    %12 = vm.shr.i64.u %1, %c32_19 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %c32_20 = vm.const.i32 32
    %15 = vm.shr.i64.u %2, %c32_20 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %c32_21 = vm.const.i32 32
    %18 = vm.shr.i64.u %3, %c32_21 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %c1_22 = vm.const.i32 1
    %c3_23 = vm.const.i32 3
    %zero_24 = vm.const.i32.zero
    %ref_25 = vm.call @hal.command_buffer.create(%__device_0, %c1_22, %c3_23, %c-1_0, %zero_24) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    %zero_26 = vm.const.i32.zero
    vm.call.variadic @hal.command_buffer.push_constants(%ref_25, %__device_0_pipeline_layout_0, %zero_26, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    %zero_27 = vm.const.i32.zero
    %zero_28 = vm.const.i32.zero
    %zero_29 = vm.const.i32.zero
    %c1_30 = vm.const.i32 1
    %zero_31 = vm.const.i32.zero
    %c2_32 = vm.const.i32 2
    %zero_33 = vm.const.i32.zero
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_25, %__device_0_pipeline_layout_0, %zero_27, [(%zero_28, %zero_29, %ref, %zero_1, %7), (%c1_30, %zero_31, %ref_9, %zero_1, %c2400), (%c2_32, %zero_33, %ref_17, %zero_1, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    %zero_34 = vm.const.i32.zero
    %c7_35 = vm.const.i32 7
    %c4_36 = vm.const.i32 4
    %c1_37 = vm.const.i32 1
    %zero_38 = vm.const.i64.zero
    vm.call @hal.command_buffer.dispatch(%ref_25, %__device_0_executable_0_main_dispatch_0, %zero_34, %c7_35, %c4_36, %c1_37, %zero_38) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    %c28 = vm.const.i32 28
    %c13 = vm.const.i32 13
    %zero_39 = vm.const.i32.zero
    vm.call @hal.command_buffer.execution_barrier(%ref_25, %c28, %c13, %zero_39) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_25) : (!vm.ref<!hal.command_buffer>) -> ()
    %zero_40 = vm.const.i32.zero
    %ref_41 = vm.call @hal.fence.create(%__device_0, %zero_40) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_0, %ref_14, %ref_41, [%ref_25]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_41]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_fail %20, "failed to wait on timepoint"
    %ref_42 = vm.call.variadic @hal.buffer_view.create(%ref_17, %zero_1, %c2016, %c553648160, %c1_5, [%c2, %c4, %c7, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_42 : !vm.ref<!hal.buffer_view>
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
}
// -----// IR Dump After mlir::iree_compiler::IREE::VM::ReifyRodataTablesPass (iree-vm-reify-rodata-tables) //----- //
vm.module public @module {
  vm.global.ref private @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.initializer {
    %null = vm.const.ref.zero : !vm.ref<!hal.executable>
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1 = vm.const.i64 1
    %null_1 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_1 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
    %req = vm.cmp.eq.ref %4, %null_1 : !vm.ref<!hal.device>
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %5 = vm.and.i32 %req, %slt : i32
    vm.cond_br %5, ^bb2, ^bb5
  ^bb2:  // pred: ^bb1
    %6 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %buffer = vm.rodata.inline "_utf8_hal_device_id_D0F1B3E9D63E707C" {alignment = 1 : i64} : !vm.buffer = "hal.device.id"
    %buffer_2 = vm.rodata.inline "_utf8_local_8DC315A014BAFA34" {alignment = 1 : i64} : !vm.buffer = "local*"
    %7:2 = vm.call @hal.device.query.i64(%ref, %buffer, %buffer_2) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %7#1 : i64
    %zero_3 = vm.const.i32.zero
    %8 = vm.select.i32 %7#0, %nz, %zero_3 : i32
    %c1_4 = vm.const.i32 1
    vm.cond_br %8, ^bb3, ^bb4(%zero : i32)
  ^bb3:  // pred: ^bb2
    %buffer_5 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
    %buffer_6 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64"
    %9:2 = vm.call @hal.device.query.i64(%ref, %buffer_5, %buffer_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %9#1 : i64
    %zero_8 = vm.const.i32.zero
    %10 = vm.select.i32 %9#0, %nz_7, %zero_8 : i32
    %c1_9 = vm.const.i32 1
    vm.br ^bb4(%10 : i32)
  ^bb4(%11: i32):  // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %12 = vm.select.i64 %11, %c1, %zero_0 : i64
    %13 = vm.add.i64 %3, %12 : i64
    %14 = vm.and.i32 %11, %eq : i32
    %ref_10 = vm.select.ref %14, %ref, %null_1 : !vm.ref<!hal.device>
    %15 = vm.add.i64 %2, %c1 : i64
    vm.br ^bb1(%15, %13, %ref_10 : i64, i64, !vm.ref<!hal.device>)
  ^bb5:  // pred: ^bb1
    vm.cond_br %req, ^bb6, ^bb7
  ^bb6:  // pred: ^bb5
    vm.cond_fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]>"
    vm.br ^bb7
  ^bb7:  // 2 preds: ^bb5, ^bb6
    %buffer_11 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
    %buffer_12 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64"
    %16:2 = vm.call @hal.device.query.i64(%4, %buffer_11, %buffer_12) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer,
!vm.buffer) -> (i32, i64) %nz_13 = vm.cmp.nz.i64 %16#1 : i64 %zero_14 = vm.const.i32.zero %17 = vm.select.i32 %16#0, %nz_13, %zero_14 : i32 %c1_15 = vm.const.i32 1 %c1_16 = vm.const.i32 1 %zero_17 = vm.const.i32.zero %c7 = vm.const.i32 7 %c3 = vm.const.i32 3 %c1_18 = vm.const.i32 1 %c7_19 = vm.const.i32 7 %c3_20 = vm.const.i32 3 %c2 = vm.const.i32 2 %c7_21 = vm.const.i32 7 %c2_22 = vm.const.i32 2 %ref_23 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1_16, [(%zero_17, %c7, %c3), (%c1_18, %c7_19, %c3_20), (%c2, %c7_21, %c2_22)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %c8 = vm.const.i32 8 %ref_24 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_23]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %18 = vm.select.i64 %17, %zero_0, %c-1 : i64 %eq_25 = vm.cmp.eq.i64 %18, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_25, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %buffer_26 = vm.rodata.inline "main_dispatch_0_embedded_elf_x86_64" {alignment = 16 : i64, mime_type = "application/x-elf"} : !vm.buffer = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666
666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000
00000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> %buffer_27 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64" %null_28 = vm.const.ref.zero : !vm.buffer %ref_29 = vm.call.variadic @hal.executable.create(%4, %buffer_27, %buffer_26, %null_28, [%ref_24]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref<!hal.executable>
    vm.br ^bb10(%ref_29 : !vm.ref<!hal.executable>)
  ^bb9:  // pred: ^bb7
    vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
    vm.br ^bb10(%null : !vm.ref<!hal.executable>)
  ^bb10(%19: !vm.ref<!hal.executable>):  // 2 preds: ^bb8, ^bb9
    vm.global.store.ref %19, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_24, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  }
  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
  vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...)
-> !vm.ref attributes {nosideeffects} vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7 = vm.const.i64 7 %c2 = vm.const.i64 2 %c1 = vm.const.i64 1 %zero = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_0 = vm.const.i64 -1 %c32 = vm.const.i64 32 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %zero_1 = vm.const.i64.zero %c5 = vm.const.i64 5 %c4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %zero_2 = vm.const.i32.zero %0 = vm.call @hal.buffer_view.dim(%arg0, %zero_2) {nosideeffects} : (!vm.ref, i32) -> i64 %c1_3 = vm.const.i32 1 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1_3) {nosideeffects} : (!vm.ref, i32) -> i64 %c2_4 = vm.const.i32 2 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2_4) {nosideeffects} : (!vm.ref, i32) -> i64 %c3 = vm.const.i32 3 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %c553648160 = vm.const.i32 553648160 %c1_5 = vm.const.i32 1 %buffer = vm.rodata.inline "_utf8_input0_DA9A70D360954439" {alignment = 1 : i64} : !vm.buffer = "input0" vm.call.variadic @hal.buffer_view.assert(%arg0, %buffer, %c553648160, %c1_5, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_6 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %buffer_7 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor" %c16 = vm.const.i32 16 %c3075 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref, %buffer_7, %ref_6, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %buffer_8 = vm.rodata.inline "_utf8_input1_FDCC539DA203DDD3" {alignment = 1 : i64} : !vm.buffer = "input1" vm.call.variadic @hal.buffer_view.assert(%arg1, %buffer_8, %c553648160, %c1_5, [%c4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
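// Note (worked sizes): %c553648160 is 0x21000020, IREE's HAL element-type
// encoding for f32 (numerical-type tag in the high byte, bit width
// 0x20 = 32 in the low byte), matching the f32 tensors in the reflection
// metadata. The static byte sizes asserted and allocated below are plain
// products:
//   %input1: 4 * 6 * 5 * 5 elements * 4 B/element = 2400  (%c2400)
//   %output: 2 * 4 * 7 * 9 elements * 4 B/element = 2016  (%c2016)
// %7 builds the dynamic %input0 byte size the same way, folding the
// 4 B/element factor (%c4) into the product of dims %0..%3.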
%ref_9 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref %buffer_10 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor" %c16_11 = vm.const.i32 16 %c3075_12 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref_9, %buffer_10, %ref_6, %c2400, %c16_11, %c3075_12) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %zero_13 = vm.const.i32.zero %ref_14 = vm.call @hal.fence.create(%__device_0, %zero_13) : (!vm.ref, i32) -> !vm.ref %zero_15 = vm.const.i32.zero %c48 = vm.const.i32 48 %c3075_16 = vm.const.i32 3075 %ref_17 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_0, %null, %ref_14, %zero_15, %c48, %c3075_16, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %c32_18 = vm.const.i32 32 %9 = vm.shr.i64.u %0, %c32_18 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %c32_19 = vm.const.i32 32 %12 = vm.shr.i64.u %1, %c32_19 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %c32_20 = vm.const.i32 32 %15 = vm.shr.i64.u %2, %c32_20 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %c32_21 = vm.const.i32 32 %18 = vm.shr.i64.u %3, %c32_21 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %c1_22 = vm.const.i32 1 %c3_23 = vm.const.i32 3 %zero_24 = vm.const.i32.zero %ref_25 = vm.call @hal.command_buffer.create(%__device_0, %c1_22, %c3_23, %c-1_0, %zero_24) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref %zero_26 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_constants(%ref_25, %__device_0_pipeline_layout_0, %zero_26, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) %zero_27 = vm.const.i32.zero %zero_28 = vm.const.i32.zero %zero_29 = vm.const.i32.zero %c1_30 = vm.const.i32 1 %zero_31 = vm.const.i32.zero %c2_32 = vm.const.i32 2 %zero_33 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_25, %__device_0_pipeline_layout_0, %zero_27, [(%zero_28, %zero_29, %ref, %zero_1, %7), (%c1_30, %zero_31, %ref_9, %zero_1, %c2400), (%c2_32, %zero_33, %ref_17, %zero_1, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) %zero_34 = vm.const.i32.zero %c7_35 = vm.const.i32 7 %c4_36 = vm.const.i32 4 %c1_37 = vm.const.i32 1 %zero_38 = vm.const.i64.zero vm.call @hal.command_buffer.dispatch(%ref_25, %__device_0_executable_0_main_dispatch_0, %zero_34, %c7_35, %c4_36, %c1_37, %zero_38) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () %c28 = vm.const.i32 28 %c13 = vm.const.i32 13 %zero_39 = vm.const.i32.zero vm.call @hal.command_buffer.execution_barrier(%ref_25, %c28, %c13, %zero_39) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_25) : (!vm.ref) -> () %zero_40 = vm.const.i32.zero %ref_41 = vm.call @hal.fence.create(%__device_0, %zero_40) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_0, %ref_14, %ref_41, [%ref_25]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_41]) : (i32, !vm.ref ...) -> i32 vm.cond_fail %20, "failed to wait on timepoint" %ref_42 = vm.call.variadic @hal.buffer_view.create(%ref_17, %zero_1, %c2016, %c553648160, %c1_5, [%c2, %c4, %c7, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_42 : !vm.ref } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } // -----// IR Dump Before mlir::iree_compiler::IREE::VM::HoistInlinedRodataPass (iree-vm-hoist-inlined-rodata) //----- // vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.initializer { %null = vm.const.ref.zero : !vm.ref %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1 = vm.const.i64 1 %null_1 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_1 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %req = vm.cmp.eq.ref %4, %null_1 : !vm.ref %slt = vm.cmp.lt.i64.s %2, %1 : i64 %5 = vm.and.i32 %req, %slt : i32 vm.cond_br %5, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %6 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref %buffer = vm.rodata.inline "_utf8_hal_device_id_D0F1B3E9D63E707C" {alignment = 1 : i64} : !vm.buffer = "hal.device.id" %buffer_2 = vm.rodata.inline "_utf8_local_8DC315A014BAFA34" {alignment = 1 : i64} : !vm.buffer = "local*" %7:2 = vm.call @hal.device.query.i64(%ref, %buffer, %buffer_2) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %7#1 : i64 %zero_3 = vm.const.i32.zero %8 = vm.select.i32 %7#0, %nz, %zero_3 : i32 %c1_4 = vm.const.i32 1 vm.cond_br %8, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %buffer_5 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format" %buffer_6 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64" %9:2 = vm.call @hal.device.query.i64(%ref, %buffer_5, %buffer_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %9#1 : i64 %zero_8 = vm.const.i32.zero %10 = vm.select.i32 %9#0, %nz_7, %zero_8 : i32 %c1_9 = vm.const.i32 1 vm.br ^bb4(%10 : i32) ^bb4(%11: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %12 = vm.select.i64 %11, %c1, %zero_0 : i64 %13 = vm.add.i64 %3, %12 : i64 %14 = vm.and.i32 %11, %eq : i32 %ref_10 = vm.select.ref %14, %ref, %null_1 : !vm.ref %15 = vm.add.i64 %2, %c1 : i64 vm.br ^bb1(%15, %13, %ref_10 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %req, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.cond_fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]>" vm.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %buffer_11 = vm.rodata.inline "_utf8_hal_executable_format_1F9665C75F0004D3" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format" %buffer_12 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64" %16:2 = vm.call @hal.device.query.i64(%4, %buffer_11, %buffer_12) {nosideeffects} : (!vm.ref, !vm.buffer, 
!vm.buffer) -> (i32, i64) %nz_13 = vm.cmp.nz.i64 %16#1 : i64 %zero_14 = vm.const.i32.zero %17 = vm.select.i32 %16#0, %nz_13, %zero_14 : i32 %c1_15 = vm.const.i32 1 %c1_16 = vm.const.i32 1 %zero_17 = vm.const.i32.zero %c7 = vm.const.i32 7 %c3 = vm.const.i32 3 %c1_18 = vm.const.i32 1 %c7_19 = vm.const.i32 7 %c3_20 = vm.const.i32 3 %c2 = vm.const.i32 2 %c7_21 = vm.const.i32 7 %c2_22 = vm.const.i32 2 %ref_23 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1_16, [(%zero_17, %c7, %c3), (%c1_18, %c7_19, %c3_20), (%c2, %c7_21, %c2_22)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %c8 = vm.const.i32 8 %ref_24 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_23]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %18 = vm.select.i64 %17, %zero_0, %c-1 : i64 %eq_25 = vm.cmp.eq.i64 %18, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_25, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %buffer_26 = vm.rodata.inline "main_dispatch_0_embedded_elf_x86_64" {alignment = 16 : i64, mime_type = "application/x-elf"} : !vm.buffer = dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666
666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000
00000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> %buffer_27 = vm.rodata.inline "_utf8_embedded_elf_x86_64_11EF7D6636570B50" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-x86_64" %null_28 = vm.const.ref.zero : !vm.buffer %ref_29 = vm.call.variadic @hal.executable.create(%4, %buffer_27, %buffer_26, %null_28, [%ref_24]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.br ^bb10(%ref_29 : !vm.ref) ^bb9: // pred: ^bb7 vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" vm.br ^bb10(%null : !vm.ref) ^bb10(%19: !vm.ref): // 2 preds: ^bb8, ^bb9 vm.global.store.ref %19, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_24, @__device_0_pipeline_layout_0 : !vm.ref vm.return } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
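// Note: the initializer above does the one-time device and executable
// setup: it enumerates devices, takes the first whose "hal.device.id"
// matches "local*" and that reports support for "embedded-elf-x86_64", then
// creates a descriptor set layout with three buffer bindings (the two
// inputs plus the result) and a pipeline layout with 8 push constants.
// Those 8 constants are the four dynamic i64 dims of %input0, each split
// into lo/hi i32 halves in @main below, e.g. for dim %0:
//   %8 = vm.trunc.i64.i32 %0 : i64 -> i32      // lo 32 bits
//   %9 = vm.shr.i64.u %0, %c32_18 : i64
//   %10 = vm.trunc.i64.i32 %9 : i64 -> i32     // hi 32 bits
// before being handed to hal.command_buffer.push_constants.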
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7 = vm.const.i64 7 %c2 = vm.const.i64 2 %c1 = vm.const.i64 1 %zero = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_0 = vm.const.i64 -1 %c32 = vm.const.i64 32 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %zero_1 = vm.const.i64.zero %c5 = vm.const.i64 5 %c4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %zero_2 = vm.const.i32.zero %0 = vm.call @hal.buffer_view.dim(%arg0, %zero_2) {nosideeffects} : (!vm.ref, i32) -> i64 %c1_3 = vm.const.i32 1 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1_3) {nosideeffects} : (!vm.ref, i32) -> i64 %c2_4 = vm.const.i32 2 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2_4) {nosideeffects} : (!vm.ref, i32) -> i64 %c3 = vm.const.i32 3 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %c553648160 = vm.const.i32 553648160 %c1_5 = vm.const.i32 1 %buffer = vm.rodata.inline "_utf8_input0_DA9A70D360954439" {alignment = 1 : i64} : !vm.buffer = "input0" vm.call.variadic @hal.buffer_view.assert(%arg0, %buffer, %c553648160, %c1_5, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_6 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %buffer_7 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor" %c16 = vm.const.i32 16 %c3075 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref, %buffer_7, %ref_6, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %buffer_8 = vm.rodata.inline "_utf8_input1_FDCC539DA203DDD3" {alignment = 1 : i64} : !vm.buffer = "input1" vm.call.variadic @hal.buffer_view.assert(%arg1, %buffer_8, %c553648160, %c1_5, [%c4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%ref_9 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref %buffer_10 = vm.rodata.inline "_utf8_tensor_41A152EEDB094D7A" {alignment = 1 : i64} : !vm.buffer = "tensor" %c16_11 = vm.const.i32 16 %c3075_12 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref_9, %buffer_10, %ref_6, %c2400, %c16_11, %c3075_12) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %zero_13 = vm.const.i32.zero %ref_14 = vm.call @hal.fence.create(%__device_0, %zero_13) : (!vm.ref, i32) -> !vm.ref %zero_15 = vm.const.i32.zero %c48 = vm.const.i32 48 %c3075_16 = vm.const.i32 3075 %ref_17 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_0, %null, %ref_14, %zero_15, %c48, %c3075_16, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %c32_18 = vm.const.i32 32 %9 = vm.shr.i64.u %0, %c32_18 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %c32_19 = vm.const.i32 32 %12 = vm.shr.i64.u %1, %c32_19 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %c32_20 = vm.const.i32 32 %15 = vm.shr.i64.u %2, %c32_20 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %c32_21 = vm.const.i32 32 %18 = vm.shr.i64.u %3, %c32_21 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %c1_22 = vm.const.i32 1 %c3_23 = vm.const.i32 3 %zero_24 = vm.const.i32.zero %ref_25 = vm.call @hal.command_buffer.create(%__device_0, %c1_22, %c3_23, %c-1_0, %zero_24) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref %zero_26 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_constants(%ref_25, %__device_0_pipeline_layout_0, %zero_26, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) %zero_27 = vm.const.i32.zero %zero_28 = vm.const.i32.zero %zero_29 = vm.const.i32.zero %c1_30 = vm.const.i32 1 %zero_31 = vm.const.i32.zero %c2_32 = vm.const.i32 2 %zero_33 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_25, %__device_0_pipeline_layout_0, %zero_27, [(%zero_28, %zero_29, %ref, %zero_1, %7), (%c1_30, %zero_31, %ref_9, %zero_1, %c2400), (%c2_32, %zero_33, %ref_17, %zero_1, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) %zero_34 = vm.const.i32.zero %c7_35 = vm.const.i32 7 %c4_36 = vm.const.i32 4 %c1_37 = vm.const.i32 1 %zero_38 = vm.const.i64.zero vm.call @hal.command_buffer.dispatch(%ref_25, %__device_0_executable_0_main_dispatch_0, %zero_34, %c7_35, %c4_36, %c1_37, %zero_38) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () %c28 = vm.const.i32 28 %c13 = vm.const.i32 13 %zero_39 = vm.const.i32.zero vm.call @hal.command_buffer.execution_barrier(%ref_25, %c28, %c13, %zero_39) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_25) : (!vm.ref) -> () %zero_40 = vm.const.i32.zero %ref_41 = vm.call @hal.fence.create(%__device_0, %zero_40) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_0, %ref_14, %ref_41, [%ref_25]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_41]) : (i32, !vm.ref ...) -> i32 vm.cond_fail %20, "failed to wait on timepoint" %ref_42 = vm.call.variadic @hal.buffer_view.create(%ref_17, %zero_1, %c2016, %c553648160, %c1_5, [%c2, %c4, %c7, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_42 : !vm.ref } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } // -----// IR Dump After mlir::iree_compiler::IREE::VM::HoistInlinedRodataPass (iree-vm-hoist-inlined-rodata) //----- // vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3_0 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50_1 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8
B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000
180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50_2 {alignment = 1 : i64} "embedded-elf-x86_64" vm.initializer { %null = vm.const.ref.zero : !vm.ref %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1 = vm.const.i64 1 %null_1 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_1 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %req = vm.cmp.eq.ref %4, %null_1 : !vm.ref %slt = vm.cmp.lt.i64.s %2, %1 : i64 %5 = vm.and.i32 %req, %slt : i32 vm.cond_br %5, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %6 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %7:2 = vm.call @hal.device.query.i64(%ref, 
%_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %7#1 : i64 %zero_2 = vm.const.i32.zero %8 = vm.select.i32 %7#0, %nz, %zero_2 : i32 %c1_3 = vm.const.i32 1 vm.cond_br %8, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %9:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_4 = vm.cmp.nz.i64 %9#1 : i64 %zero_5 = vm.const.i32.zero %10 = vm.select.i32 %9#0, %nz_4, %zero_5 : i32 %c1_6 = vm.const.i32 1 vm.br ^bb4(%10 : i32) ^bb4(%11: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %12 = vm.select.i64 %11, %c1, %zero_0 : i64 %13 = vm.add.i64 %3, %12 : i64 %14 = vm.and.i32 %11, %eq : i32 %ref_7 = vm.select.ref %14, %ref, %null_1 : !vm.ref %15 = vm.add.i64 %2, %c1 : i64 vm.br ^bb1(%15, %13, %ref_7 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %req, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.cond_fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]>" vm.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %_utf8_hal_executable_format_1F9665C75F0004D3_0 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3_0 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_1 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50_1 : !vm.buffer %16:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_0, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_1) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_8 = vm.cmp.nz.i64 %16#1 : i64 %zero_9 = vm.const.i32.zero %17 = vm.select.i32 %16#0, %nz_8, %zero_9 : i32 %c1_10 = vm.const.i32 1 %c1_11 = vm.const.i32 1 %zero_12 = vm.const.i32.zero %c7 = vm.const.i32 7 %c3 = vm.const.i32 3 %c1_13 = vm.const.i32 1 %c7_14 = vm.const.i32 7 %c3_15 = vm.const.i32 3 %c2 = vm.const.i32 2 %c7_16 = vm.const.i32 7 %c2_17 = vm.const.i32 2 %ref_18 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1_11, [(%zero_12, %c7, %c3), (%c1_13, %c7_14, %c3_15), (%c2, %c7_16, %c2_17)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %c8 = vm.const.i32 8 %ref_19 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_18]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) 
-> !vm.ref %18 = vm.select.i64 %17, %zero_0, %c-1 : i64 %eq_20 = vm.cmp.eq.i64 %18, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_20, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_2 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50_2 : !vm.buffer %null_21 = vm.const.ref.zero : !vm.buffer %ref_22 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_2, %main_dispatch_0_embedded_elf_x86_64, %null_21, [%ref_19]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.br ^bb10(%ref_22 : !vm.ref) ^bb9: // pred: ^bb7 vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" vm.br ^bb10(%null : !vm.ref) ^bb10(%19: !vm.ref): // 2 preds: ^bb8, ^bb9 vm.global.store.ref %19, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_19, @__device_0_pipeline_layout_0 : !vm.ref vm.return } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
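// Note: comparing this dump with the "before" dump above shows the whole
// effect of iree-vm-hoist-inlined-rodata: every vm.rodata.inline that used
// to materialize its buffer at the use site becomes a module-level
// vm.rodata plus a vm.const.ref.rodata at the old use. A minimal sketch of
// the rewrite (the @_utf8_key symbol is illustrative, not from this module):
//
//   // before:
//   vm.func @f() {
//     %key = vm.rodata.inline "_utf8_key" {alignment = 1 : i64} : !vm.buffer = "key"
//     ...
//   }
//
//   // after:
//   vm.rodata private @_utf8_key {alignment = 1 : i64} "key"
//   vm.func @f() {
//     %key = vm.const.ref.rodata @_utf8_key : !vm.buffer
//     ...
//   }
//
// Textually identical payloads are kept as distinct symbols and uniqued
// with _0/_1/_2 suffixes, which is why "hal.executable.format" and
// "embedded-elf-x86_64" each appear more than once in the rodata list
// above.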
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.rodata private @_utf8_tensor_41A152EEDB094D7A_3 {alignment = 1 : i64} "tensor" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7 = vm.const.i64 7 %c2 = vm.const.i64 2 %c1 = vm.const.i64 1 %zero = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_0 = vm.const.i64 -1 %c32 = vm.const.i64 32 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %zero_1 = vm.const.i64.zero %c5 = vm.const.i64 5 %c4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %zero_2 = vm.const.i32.zero %0 = vm.call @hal.buffer_view.dim(%arg0, %zero_2) {nosideeffects} : (!vm.ref, i32) -> i64 %c1_3 = vm.const.i32 1 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1_3) {nosideeffects} : (!vm.ref, i32) -> i64 %c2_4 = vm.const.i32 2 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2_4) {nosideeffects} : (!vm.ref, i32) -> i64 %c3 = vm.const.i32 3 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %c553648160 = vm.const.i32 553648160 %c1_5 = vm.const.i32 1 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1_5, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_6 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer %c16 = vm.const.i32 16 %c3075 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_6, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1_5, [%c4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
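// note: 553648160 = 0x21000020, the HAL element-type code for f32. The two
// hal.buffer_view.assert calls above check that input0/input1 are f32 tensors of the expected
// shapes, and %4..%7 compute input0's dynamic byte length (product of its four dims x 4 bytes).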
%ref_7 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A_3 = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A_3 : !vm.buffer %c16_8 = vm.const.i32 16 %c3075_9 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref_7, %_utf8_tensor_41A152EEDB094D7A_3, %ref_6, %c2400, %c16_8, %c3075_9) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %zero_10 = vm.const.i32.zero %ref_11 = vm.call @hal.fence.create(%__device_0, %zero_10) : (!vm.ref, i32) -> !vm.ref %zero_12 = vm.const.i32.zero %c48 = vm.const.i32 48 %c3075_13 = vm.const.i32 3075 %ref_14 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_0, %null, %ref_11, %zero_12, %c48, %c3075_13, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %c32_15 = vm.const.i32 32 %9 = vm.shr.i64.u %0, %c32_15 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %c32_16 = vm.const.i32 32 %12 = vm.shr.i64.u %1, %c32_16 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %c32_17 = vm.const.i32 32 %15 = vm.shr.i64.u %2, %c32_17 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %c32_18 = vm.const.i32 32 %18 = vm.shr.i64.u %3, %c32_18 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %c1_19 = vm.const.i32 1 %c3_20 = vm.const.i32 3 %zero_21 = vm.const.i32.zero %ref_22 = vm.call @hal.command_buffer.create(%__device_0, %c1_19, %c3_20, %c-1_0, %zero_21) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref %zero_23 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_constants(%ref_22, %__device_0_pipeline_layout_0, %zero_23, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) %zero_24 = vm.const.i32.zero %zero_25 = vm.const.i32.zero %zero_26 = vm.const.i32.zero %c1_27 = vm.const.i32 1 %zero_28 = vm.const.i32.zero %c2_29 = vm.const.i32 2 %zero_30 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_22, %__device_0_pipeline_layout_0, %zero_24, [(%zero_25, %zero_26, %ref, %zero_1, %7), (%c1_27, %zero_28, %ref_7, %zero_1, %c2400), (%c2_29, %zero_30, %ref_14, %zero_1, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) %zero_31 = vm.const.i32.zero %c7_32 = vm.const.i32 7 %c4_33 = vm.const.i32 4 %c1_34 = vm.const.i32 1 %zero_35 = vm.const.i64.zero vm.call @hal.command_buffer.dispatch(%ref_22, %__device_0_executable_0_main_dispatch_0, %zero_31, %c7_32, %c4_33, %c1_34, %zero_35) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () %c28 = vm.const.i32 28 %c13 = vm.const.i32 13 %zero_36 = vm.const.i32.zero vm.call @hal.command_buffer.execution_barrier(%ref_22, %c28, %c13, %zero_36) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_22) : (!vm.ref) -> () %zero_37 = vm.const.i32.zero %ref_38 = vm.call @hal.fence.create(%__device_0, %zero_37) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_0, %ref_11, %ref_38, [%ref_22]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_38]) : (i32, !vm.ref ...) -> i32 vm.cond_fail %20, "failed to wait on timepoint" %ref_39 = vm.call.variadic @hal.buffer_view.create(%ref_14, %zero_1, %c2016, %c553648160, %c1_5, [%c2, %c4, %c7, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_39 : !vm.ref } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } // -----// IR Dump Before mlir::iree_compiler::IREE::VM::DeduplicateRodataPass (iree-vm-deduplicate-rodata) //----- // vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3_0 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50_1 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5
A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C80100000000000030000000000000000300000001000000080000000000000018
0000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50_2 {alignment = 1 : i64} "embedded-elf-x86_64" vm.initializer { %null = vm.const.ref.zero : !vm.ref %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1 = vm.const.i64 1 %null_1 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_1 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %req = vm.cmp.eq.ref %4, %null_1 : !vm.ref %slt = vm.cmp.lt.i64.s %2, %1 : i64 %5 = vm.and.i32 %req, %slt : i32 vm.cond_br %5, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %6 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %7:2 = vm.call @hal.device.query.i64(%ref, 
%_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %7#1 : i64 %zero_2 = vm.const.i32.zero %8 = vm.select.i32 %7#0, %nz, %zero_2 : i32 %c1_3 = vm.const.i32 1 vm.cond_br %8, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %9:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_4 = vm.cmp.nz.i64 %9#1 : i64 %zero_5 = vm.const.i32.zero %10 = vm.select.i32 %9#0, %nz_4, %zero_5 : i32 %c1_6 = vm.const.i32 1 vm.br ^bb4(%10 : i32) ^bb4(%11: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %12 = vm.select.i64 %11, %c1, %zero_0 : i64 %13 = vm.add.i64 %3, %12 : i64 %14 = vm.and.i32 %11, %eq : i32 %ref_7 = vm.select.ref %14, %ref, %null_1 : !vm.ref %15 = vm.add.i64 %2, %c1 : i64 vm.br ^bb1(%15, %13, %ref_7 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %req, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.cond_fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]>" vm.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %_utf8_hal_executable_format_1F9665C75F0004D3_0 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3_0 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_1 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50_1 : !vm.buffer %16:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_0, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_1) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_8 = vm.cmp.nz.i64 %16#1 : i64 %zero_9 = vm.const.i32.zero %17 = vm.select.i32 %16#0, %nz_8, %zero_9 : i32 %c1_10 = vm.const.i32 1 %c1_11 = vm.const.i32 1 %zero_12 = vm.const.i32.zero %c7 = vm.const.i32 7 %c3 = vm.const.i32 3 %c1_13 = vm.const.i32 1 %c7_14 = vm.const.i32 7 %c3_15 = vm.const.i32 3 %c2 = vm.const.i32 2 %c7_16 = vm.const.i32 7 %c2_17 = vm.const.i32 2 %ref_18 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1_11, [(%zero_12, %c7, %c3), (%c1_13, %c7_14, %c3_15), (%c2, %c7_16, %c2_17)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %c8 = vm.const.i32 8 %ref_19 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_18]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) 
-> !vm.ref %18 = vm.select.i64 %17, %zero_0, %c-1 : i64 %eq_20 = vm.cmp.eq.i64 %18, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_20, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_2 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50_2 : !vm.buffer %null_21 = vm.const.ref.zero : !vm.buffer %ref_22 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_2, %main_dispatch_0_embedded_elf_x86_64, %null_21, [%ref_19]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.br ^bb10(%ref_22 : !vm.ref) ^bb9: // pred: ^bb7 vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" vm.br ^bb10(%null : !vm.ref) ^bb10(%19: !vm.ref): // 2 preds: ^bb8, ^bb9 vm.global.store.ref %19, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_19, @__device_0_pipeline_layout_0 : !vm.ref vm.return } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.rodata private @_utf8_tensor_41A152EEDB094D7A_3 {alignment = 1 : i64} "tensor" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7 = vm.const.i64 7 %c2 = vm.const.i64 2 %c1 = vm.const.i64 1 %zero = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_0 = vm.const.i64 -1 %c32 = vm.const.i64 32 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %zero_1 = vm.const.i64.zero %c5 = vm.const.i64 5 %c4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %zero_2 = vm.const.i32.zero %0 = vm.call @hal.buffer_view.dim(%arg0, %zero_2) {nosideeffects} : (!vm.ref, i32) -> i64 %c1_3 = vm.const.i32 1 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1_3) {nosideeffects} : (!vm.ref, i32) -> i64 %c2_4 = vm.const.i32 2 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2_4) {nosideeffects} : (!vm.ref, i32) -> i64 %c3 = vm.const.i32 3 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %c553648160 = vm.const.i32 553648160 %c1_5 = vm.const.i32 1 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1_5, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_6 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer %c16 = vm.const.i32 16 %c3075 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_6, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1_5, [%c4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
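// note: the ops below record a one-shot command buffer: hal.device.queue.alloca reserves the
// 2016-byte output (2x4x7x9 x f32), push_constants passes the four input dims split into
// lo/hi i32 halves, push_descriptor_set binds input0/input1/output, the dispatch launches
// main_dispatch_0 over a 7x4x1 workgroup grid, and hal.fence.await blocks until the queue
// signals completion.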
%ref_7 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A_3 = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A_3 : !vm.buffer %c16_8 = vm.const.i32 16 %c3075_9 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref_7, %_utf8_tensor_41A152EEDB094D7A_3, %ref_6, %c2400, %c16_8, %c3075_9) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %zero_10 = vm.const.i32.zero %ref_11 = vm.call @hal.fence.create(%__device_0, %zero_10) : (!vm.ref, i32) -> !vm.ref %zero_12 = vm.const.i32.zero %c48 = vm.const.i32 48 %c3075_13 = vm.const.i32 3075 %ref_14 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_0, %null, %ref_11, %zero_12, %c48, %c3075_13, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %c32_15 = vm.const.i32 32 %9 = vm.shr.i64.u %0, %c32_15 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %c32_16 = vm.const.i32 32 %12 = vm.shr.i64.u %1, %c32_16 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %c32_17 = vm.const.i32 32 %15 = vm.shr.i64.u %2, %c32_17 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %c32_18 = vm.const.i32 32 %18 = vm.shr.i64.u %3, %c32_18 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %c1_19 = vm.const.i32 1 %c3_20 = vm.const.i32 3 %zero_21 = vm.const.i32.zero %ref_22 = vm.call @hal.command_buffer.create(%__device_0, %c1_19, %c3_20, %c-1_0, %zero_21) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref %zero_23 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_constants(%ref_22, %__device_0_pipeline_layout_0, %zero_23, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) %zero_24 = vm.const.i32.zero %zero_25 = vm.const.i32.zero %zero_26 = vm.const.i32.zero %c1_27 = vm.const.i32 1 %zero_28 = vm.const.i32.zero %c2_29 = vm.const.i32 2 %zero_30 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_22, %__device_0_pipeline_layout_0, %zero_24, [(%zero_25, %zero_26, %ref, %zero_1, %7), (%c1_27, %zero_28, %ref_7, %zero_1, %c2400), (%c2_29, %zero_30, %ref_14, %zero_1, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) %zero_31 = vm.const.i32.zero %c7_32 = vm.const.i32 7 %c4_33 = vm.const.i32 4 %c1_34 = vm.const.i32 1 %zero_35 = vm.const.i64.zero vm.call @hal.command_buffer.dispatch(%ref_22, %__device_0_executable_0_main_dispatch_0, %zero_31, %c7_32, %c4_33, %c1_34, %zero_35) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () %c28 = vm.const.i32 28 %c13 = vm.const.i32 13 %zero_36 = vm.const.i32.zero vm.call @hal.command_buffer.execution_barrier(%ref_22, %c28, %c13, %zero_36) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_22) : (!vm.ref) -> () %zero_37 = vm.const.i32.zero %ref_38 = vm.call @hal.fence.create(%__device_0, %zero_37) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_0, %ref_11, %ref_38, [%ref_22]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_38]) : (i32, !vm.ref ...) -> i32 vm.cond_fail %20, "failed to wait on timepoint" %ref_39 = vm.call.variadic @hal.buffer_view.create(%ref_14, %zero_1, %c2016, %c553648160, %c1_5, [%c2, %c4, %c7, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_39 : !vm.ref } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } // -----// IR Dump After mlir::iree_compiler::IREE::VM::DeduplicateRodataPass (iree-vm-deduplicate-rodata) //----- // vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F84000000
00000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000000000010020000000000001002000000000000230000000000000
0000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.ref %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1 = vm.const.i64 1 %null_1 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_1 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %req = vm.cmp.eq.ref %4, %null_1 : !vm.ref %slt = vm.cmp.lt.i64.s %2, %1 : i64 %5 = vm.and.i32 %req, %slt : i32 vm.cond_br %5, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %6 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %7:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %7#1 : i64 %zero_2 = vm.const.i32.zero %8 = vm.select.i32 %7#0, %nz, %zero_2 : i32 %c1_3 = vm.const.i32 1 vm.cond_br %8, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 
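// note: after iree-vm-deduplicate-rodata the _0/_1 copies of the "hal.executable.format" and
// "embedded-elf-x86_64" strings are gone from the rodata list above; each use site now
// references the single canonical symbol (locally renamed to %..._8/%..._9 below).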
%_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %9:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_4 = vm.cmp.nz.i64 %9#1 : i64 %zero_5 = vm.const.i32.zero %10 = vm.select.i32 %9#0, %nz_4, %zero_5 : i32 %c1_6 = vm.const.i32 1 vm.br ^bb4(%10 : i32) ^bb4(%11: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %12 = vm.select.i64 %11, %c1, %zero_0 : i64 %13 = vm.add.i64 %3, %12 : i64 %14 = vm.and.i32 %11, %eq : i32 %ref_7 = vm.select.ref %14, %ref, %null_1 : !vm.ref %15 = vm.add.i64 %2, %c1 : i64 vm.br ^bb1(%15, %13, %ref_7 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %req, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.cond_fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]>" vm.br ^bb7 ^bb7: // 2 preds: ^bb5, ^bb6 %_utf8_hal_executable_format_1F9665C75F0004D3_8 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_9 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %16:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_8, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_9) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_10 = vm.cmp.nz.i64 %16#1 : i64 %zero_11 = vm.const.i32.zero %17 = vm.select.i32 %16#0, %nz_10, %zero_11 : i32 %c1_12 = vm.const.i32 1 %c1_13 = vm.const.i32 1 %zero_14 = vm.const.i32.zero %c7 = vm.const.i32 7 %c3 = vm.const.i32 3 %c1_15 = vm.const.i32 1 %c7_16 = vm.const.i32 7 %c3_17 = vm.const.i32 3 %c2 = vm.const.i32 2 %c7_18 = vm.const.i32 7 %c2_19 = vm.const.i32 2 %ref_20 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1_13, [(%zero_14, %c7, %c3), (%c1_15, %c7_16, %c3_17), (%c2, %c7_18, %c2_19)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %c8 = vm.const.i32 8 %ref_21 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_20]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %18 = vm.select.i64 %17, %zero_0, %c-1 : i64 %eq_22 = vm.cmp.eq.i64 %18, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_22, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_23 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %null_24 = vm.const.ref.zero : !vm.buffer %ref_25 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_23, %main_dispatch_0_embedded_elf_x86_64, %null_24, [%ref_21]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.br ^bb10(%ref_25 : !vm.ref) ^bb9: // pred: ^bb7 vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" vm.br ^bb10(%null : !vm.ref) ^bb10(%19: !vm.ref): // 2 preds: ^bb8, ^bb9 vm.global.store.ref %19, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_21, @__device_0_pipeline_layout_0 : !vm.ref vm.return } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
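// note: both the older HAL entry points (command_buffer.dispatch, executable.create) and
// their minimum_version = 4 successors (dispatch2, executable.create2) are imported below,
// but this module only calls the older forms.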
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7 = vm.const.i64 7 %c2 = vm.const.i64 2 %c1 = vm.const.i64 1 %zero = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_0 = vm.const.i64 -1 %c32 = vm.const.i64 32 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %zero_1 = vm.const.i64.zero %c5 = vm.const.i64 5 %c4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %zero_2 = vm.const.i32.zero %0 = vm.call @hal.buffer_view.dim(%arg0, %zero_2) {nosideeffects} : (!vm.ref, i32) -> i64 %c1_3 = vm.const.i32 1 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1_3) {nosideeffects} : (!vm.ref, i32) -> i64 %c2_4 = vm.const.i32 2 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2_4) {nosideeffects} : (!vm.ref, i32) -> i64 %c3 = vm.const.i32 3 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %c553648160 = vm.const.i32 553648160 %c1_5 = vm.const.i32 1 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1_5, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_6 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer %c16 = vm.const.i32 16 %c3075 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_6, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1_5, [%c4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
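// ABI prologue, summarized: the four hal.buffer_view.dim calls above read the
// dynamic dims of %input0, 553648160 (0x21000020) is the HAL element-type code
// for f32, and the vm.mul.i64 chain computes the dynamic byte size
// d0*4*d1*d2*d3. %c2400 is the static byte size of the 4x6x5x5xf32 filter
// (600 elements * 4 B) and %c2016 that of the 2x4x7x9xf32 result (504 * 4 B).
// The rest of the function allocates the result on the device queue, records a
// one-shot command buffer (push constants carrying each i64 dim as lo/hi i32
// halves, three buffer bindings, a 7x4x1 workgroup dispatch), submits it
// between the alloca fence and a new signal fence, awaits the signal fence,
// and wraps the result buffer in the returned buffer view.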
%ref_7 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A_8 = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer %c16_9 = vm.const.i32 16 %c3075_10 = vm.const.i32 3075 vm.call @hal.buffer.assert(%ref_7, %_utf8_tensor_41A152EEDB094D7A_8, %ref_6, %c2400, %c16_9, %c3075_10) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %zero_11 = vm.const.i32.zero %ref_12 = vm.call @hal.fence.create(%__device_0, %zero_11) : (!vm.ref, i32) -> !vm.ref %zero_13 = vm.const.i32.zero %c48 = vm.const.i32 48 %c3075_14 = vm.const.i32 3075 %ref_15 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_0, %null, %ref_12, %zero_13, %c48, %c3075_14, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %c32_16 = vm.const.i32 32 %9 = vm.shr.i64.u %0, %c32_16 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %c32_17 = vm.const.i32 32 %12 = vm.shr.i64.u %1, %c32_17 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %c32_18 = vm.const.i32 32 %15 = vm.shr.i64.u %2, %c32_18 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %c32_19 = vm.const.i32 32 %18 = vm.shr.i64.u %3, %c32_19 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %c1_20 = vm.const.i32 1 %c3_21 = vm.const.i32 3 %zero_22 = vm.const.i32.zero %ref_23 = vm.call @hal.command_buffer.create(%__device_0, %c1_20, %c3_21, %c-1_0, %zero_22) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref %zero_24 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_constants(%ref_23, %__device_0_pipeline_layout_0, %zero_24, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) %zero_25 = vm.const.i32.zero %zero_26 = vm.const.i32.zero %zero_27 = vm.const.i32.zero %c1_28 = vm.const.i32 1 %zero_29 = vm.const.i32.zero %c2_30 = vm.const.i32 2 %zero_31 = vm.const.i32.zero vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_23, %__device_0_pipeline_layout_0, %zero_25, [(%zero_26, %zero_27, %ref, %zero_1, %7), (%c1_28, %zero_29, %ref_7, %zero_1, %c2400), (%c2_30, %zero_31, %ref_15, %zero_1, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) %zero_32 = vm.const.i32.zero %c7_33 = vm.const.i32 7 %c4_34 = vm.const.i32 4 %c1_35 = vm.const.i32 1 %zero_36 = vm.const.i64.zero vm.call @hal.command_buffer.dispatch(%ref_23, %__device_0_executable_0_main_dispatch_0, %zero_32, %c7_33, %c4_34, %c1_35, %zero_36) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () %c28 = vm.const.i32 28 %c13 = vm.const.i32 13 %zero_37 = vm.const.i32.zero vm.call @hal.command_buffer.execution_barrier(%ref_23, %c28, %c13, %zero_37) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_23) : (!vm.ref) -> () %zero_38 = vm.const.i32.zero %ref_39 = vm.call @hal.fence.create(%__device_0, %zero_38) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_0, %ref_12, %ref_39, [%ref_23]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_39]) : (i32, !vm.ref ...) -> i32 vm.cond_fail %20, "failed to wait on timepoint" %ref_40 = vm.call.variadic @hal.buffer_view.create(%ref_15, %zero_1, %c2016, %c553648160, %c1_5, [%c2, %c4, %c7, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_40 : !vm.ref } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After Canonicalizer (canonicalize) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34
D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000
01000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, 
^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_11 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %ref_12 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_11, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.global.store.ref %ref_12, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
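// Relative to the pre-canonicalization initializer above, canonicalize
// deduplicated and hoisted the vm.const ops into the entry block (the
// descriptor set layout bindings now reuse one %c1/%c2/%c3/%c7), rewrote the
// null test from vm.cmp.eq.ref against %null into vm.cmp.nz.ref followed by
// vm.xor with 1, and expanded the vm.cond_fail pseudo-ops so that ^bb6 and
// ^bb9 now terminate directly in vm.fail, folding the former ^bb10 join into
// ^bb8.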
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
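// NOTE: %4..%7 fold the dynamic dims of %arg0 into a byte length (4 bytes per
// f32 element times d0*d1*d2*d3), which the hal.buffer.assert above then checks
// as the minimum length of the buffer backing the input view.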
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A_7 = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A_7, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_8 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_9 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_8, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_10 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_10, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_10, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_9, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_10, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_10, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_10) : (!vm.ref) -> () %ref_11 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_8, %ref_11, [%ref_10]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_11]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2(%20 : i32), ^bb1 ^bb1: // pred: ^bb0 %ref_12 = vm.call.variadic @hal.buffer_view.create(%ref_9, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_12 : !vm.ref ^bb2(%21: i32): // pred: ^bb0 vm.fail %21, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before CSE (cse) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD0
4989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000
000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = 
vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_11 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %ref_12 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_11, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.global.store.ref %ref_12, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
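// NOTE: the vm.initializer above performs device selection; roughly, as an
// illustrative sketch (not part of the module):
//   for i in [0, hal.devices.count()):                      // ^bb1/^bb2
//     device = hal.devices.get(i)
//     if query_i64(device, "hal.device.id", "local*") and
//        query_i64(device, "hal.executable.format", "embedded-elf-x86_64"):
//       remember the first matching device                  // ^bb4
//   if none matched: vm.fail (^bb6); else store @__device_0 and build the
//   descriptor set layout, pipeline layout, and executable (^bb7/^bb8).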
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
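// NOTE: %c2400 and %c2016 are the static byte sizes of the weights and result
// (4*6*5*5*4 = 2400 and 2*4*7*9*4 = 2016). Below, each dynamic i64 dim is split
// into lo/hi i32 halves (vm.trunc.i64.i32 plus vm.shr.i64.u by %c32) to fit the
// i32 push-constant path, and the command buffer then records push_constants,
// push_descriptor_set, a 7x4x1 workgroup dispatch, and an execution barrier
// before fenced queue execution.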
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A_7 = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A_7, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_8 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_9 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_8, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_10 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_10, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_10, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_9, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_10, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_10, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_10) : (!vm.ref) -> () %ref_11 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_8, %ref_11, [%ref_10]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_11]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2(%20 : i32), ^bb1 ^bb1: // pred: ^bb0 %ref_12 = vm.call.variadic @hal.buffer_view.create(%ref_9, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_12 : !vm.ref ^bb2(%21: i32): // pred: ^bb0 vm.fail %21, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After CSE (cse) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04
989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000
00000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = 
vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
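// NOTE: relative to the pre-CSE initializer, CSE removed the redundant
// vm.const.ref.rodata reload of @_utf8_embedded_elf_x86_64_11EF7D6636570B50 in
// ^bb8 (it now reuses %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 from ^bb7),
// renumbering %ref_12 to %ref_11; the rest of the module appears unchanged by
// the pass.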
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2(%20 : i32), ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2(%21: i32): // pred: ^bb0 vm.fail %21, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB
84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003
0000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = 
vm.cmp.nz.i64 %8#1 : i64
      %9 = vm.select.i32 %8#0, %nz, %zero : i32
      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
    ^bb3: // pred: ^bb2
      %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
      %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
      vm.br ^bb4(%11 : i32)
    ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
      %14 = vm.add.i64 %3, %13 : i64
      %15 = vm.and.i32 %12, %eq : i32
      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref
      %16 = vm.add.i64 %2, %c1_1 : i64
      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref)
    ^bb5: // pred: ^bb1
      vm.cond_br %5, ^bb6, ^bb7
    ^bb6: // pred: ^bb5
      vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
    ^bb7: // pred: ^bb5
      %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
      %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
      %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref
      %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref
      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
      %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
      vm.global.store.ref %4, @__device_0 : !vm.ref
      vm.cond_br %eq_10, ^bb8, ^bb9
    ^bb8: // pred: ^bb7
      %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
      %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref
      vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref
      vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref
      vm.return
    ^bb9: // pred: ^bb7
      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
    }
    vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref
    vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref
    vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref
    vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
    vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects}
    vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32
    vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32)
    vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects}
    vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects}
    vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects}
    vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects}
    vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...)
    vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects}
    vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32}
    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref)
    vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer)
    vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref)
    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
    vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
    vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
    vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64)
    vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
    vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...)
    vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...)
    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
    vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64)
    vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32}
    vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32}
    vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
    vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref
    vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref)
    vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32)
    vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32)
    vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...)
    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...)
    vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64)
    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
    vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects}
    vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref
    vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects}
    vm.import private @hal.fence.query(%fence : !vm.ref) -> i32
    vm.import private @hal.fence.signal(%fence : !vm.ref)
    vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32)
    vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield}
    vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects}
    vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
    vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
    vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
    vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
      %c13 = vm.const.i32 13
      %c28 = vm.const.i32 28
      %c4 = vm.const.i32 4
      %c7 = vm.const.i32 7
      %c32 = vm.const.i32 32
      %c48 = vm.const.i32 48
      %c3075 = vm.const.i32 3075
      %c16 = vm.const.i32 16
      %c553648160 = vm.const.i32 553648160
      %c3 = vm.const.i32 3
      %c2 = vm.const.i32 2
      %c1 = vm.const.i32 1
      %zero = vm.const.i32.zero
      %c9 = vm.const.i64 9
      %c-1 = vm.const.i32 -1
      %c7_0 = vm.const.i64 7
      %c2_1 = vm.const.i64 2
      %zero_2 = vm.const.i64.zero
      %null = vm.const.ref.zero : !vm.ref
      %c-1_3 = vm.const.i64 -1
      %c2016 = vm.const.i64 2016
      %c2400 = vm.const.i64 2400
      %c5 = vm.const.i64 5
      %c4_4 = vm.const.i64 4
      %c6 = vm.const.i64 6
      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref
      %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref
      %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref
      %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64
      %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64
      %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64
      %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64
      %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...)
      %4 = vm.mul.i64 %0, %c4_4 : i64
      %5 = vm.mul.i64 %4, %1 : i64
      %6 = vm.mul.i64 %5, %2 : i64
      %7 = vm.mul.i64 %6, %3 : i64
      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref
      %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref
      %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> ()
      %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...)
      %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref
      vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> ()
      %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref
      %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref
      %8 = vm.trunc.i64.i32 %0 : i64 -> i32
      %9 = vm.shr.i64.u %0, %c32 : i64
      %10 = vm.trunc.i64.i32 %9 : i64 -> i32
      %11 = vm.trunc.i64.i32 %1 : i64 -> i32
      %12 = vm.shr.i64.u %1, %c32 : i64
      %13 = vm.trunc.i64.i32 %12 : i64 -> i32
      %14 = vm.trunc.i64.i32 %2 : i64 -> i32
      %15 = vm.shr.i64.u %2, %c32 : i64
      %16 = vm.trunc.i64.i32 %15 : i64 -> i32
      %17 = vm.trunc.i64.i32 %3 : i64 -> i32
      %18 = vm.shr.i64.u %3, %c32 : i64
      %19 = vm.trunc.i64.i32 %18 : i64 -> i32
      %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref
      vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...)
      vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...)
      vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> ()
      vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> ()
      vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> ()
      %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref
      vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...)
      %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32
      vm.cond_br %20, ^bb2(%20 : i32), ^bb1
    ^bb1: // pred: ^bb0
      %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2(%21: i32): // pred: ^bb0 vm.fail %21, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB8
4531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030
000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = 
vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
    vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
    vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
    vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
    vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
    vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
    vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
    vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
    vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
    vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
    vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
    vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
    vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
    vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
    vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
    vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
    vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
    vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
    vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
    vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
    vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
    vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
    vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
    vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
    vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
    vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
    vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
    vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
    vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
    vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
    vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
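    // @main below is the synchronous ABI wrapper: it queries the four dynamic dims
    // of %input0, asserts both buffer views (f32 = 0x21000020, dense encoding = 1),
    // allocates the 2016-byte result (2x4x7x9 f32) via queue.alloca, records one
    // command buffer that pushes each i64 dim as lo/hi i32 push constants (8 values)
    // and binds the three buffers, dispatches main_dispatch_0 over a 7x4x1 workgroup
    // grid, then awaits the signal fence and wraps the result in a buffer view.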
    vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
      %c13 = vm.const.i32 13
      %c28 = vm.const.i32 28
      %c4 = vm.const.i32 4
      %c7 = vm.const.i32 7
      %c32 = vm.const.i32 32
      %c48 = vm.const.i32 48
      %c3075 = vm.const.i32 3075
      %c16 = vm.const.i32 16
      %c553648160 = vm.const.i32 553648160
      %c3 = vm.const.i32 3
      %c2 = vm.const.i32 2
      %c1 = vm.const.i32 1
      %zero = vm.const.i32.zero
      %c9 = vm.const.i64 9
      %c-1 = vm.const.i32 -1
      %c7_0 = vm.const.i64 7
      %c2_1 = vm.const.i64 2
      %zero_2 = vm.const.i64.zero
      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
      %c-1_3 = vm.const.i64 -1
      %c2016 = vm.const.i64 2016
      %c2400 = vm.const.i64 2400
      %c5 = vm.const.i64 5
      %c4_4 = vm.const.i64 4
      %c6 = vm.const.i64 6
      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
      %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
      %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
      %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
      %4 = vm.mul.i64 %0, %c4_4 : i64
      %5 = vm.mul.i64 %4, %1 : i64
      %6 = vm.mul.i64 %5, %2 : i64
      %7 = vm.mul.i64 %6, %3 : i64
      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
      %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
      %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
      %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
      %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
      %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
      %8 = vm.trunc.i64.i32 %0 : i64 -> i32
      %9 = vm.shr.i64.u %0, %c32 : i64
      %10 = vm.trunc.i64.i32 %9 : i64 -> i32
      %11 = vm.trunc.i64.i32 %1 : i64 -> i32
      %12 = vm.shr.i64.u %1, %c32 : i64
      %13 = vm.trunc.i64.i32 %12 : i64 -> i32
      %14 = vm.trunc.i64.i32 %2 : i64 -> i32
      %15 = vm.shr.i64.u %2, %c32 : i64
      %16 = vm.trunc.i64.i32 %15 : i64 -> i32
      %17 = vm.trunc.i64.i32 %3 : i64 -> i32
      %18 = vm.shr.i64.u %3, %c32 : i64
      %19 = vm.trunc.i64.i32 %18 : i64 -> i32
      %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
      vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
      vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
      vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
      vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
      vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
      %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
      vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
      %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
      vm.cond_br %20, ^bb2, ^bb1
    ^bb1: // pred: ^bb0
      %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
      vm.return %ref_11 : !vm.ref<!hal.buffer_view>
    ^bb2: // pred: ^bb0
      vm.fail %20, "failed to wait on timepoint"
    }
    vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  }
}
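// The dump below is the first in this excerpt to show the serialized dispatch:
// main_dispatch_0 is embedded as an x86-64 ELF blob in vm.rodata (note the source
// path string baked into its debug info) and the HAL executable is created from
// it inside vm.initializer.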
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
module attributes {vm.toplevel} {
  vm.module public @module {
    vm.global.ref private @__device_0 : !vm.ref<!hal.device>
    vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
    vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
    vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
    vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
    vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C
8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000
00000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
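    // The initializer enumerates devices until one answers the hal.device.id query
    // for "local*", checks that it supports the "embedded-elf-x86_64" executable
    // format, then creates the descriptor set layout, pipeline layout, and
    // executable and stores them in the globals above. Otherwise it fails with
    // status 5 or 14 (NOT_FOUND / UNAVAILABLE in iree_status_code_t).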
    vm.initializer {
      %null = vm.const.ref.zero : !vm.buffer
      %c8 = vm.const.i32 8
      %c2 = vm.const.i32 2
      %c3 = vm.const.i32 3
      %c7 = vm.const.i32 7
      %c1 = vm.const.i32 1
      %c14 = vm.const.i32 14
      %c-1 = vm.const.i64 -1
      %c5 = vm.const.i32 5
      %zero = vm.const.i32.zero
      %zero_0 = vm.const.i64.zero
      %c1_1 = vm.const.i64 1
      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
      vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
      %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
      %5 = vm.xor.i32 %rnz, %c1 : i32
      %slt = vm.cmp.lt.i64.s %2, %1 : i64
      %6 = vm.and.i32 %5, %slt : i32
      vm.cond_br %6, ^bb2, ^bb5
    ^bb2: // pred: ^bb1
      %7 = vm.trunc.i64.i32 %2 : i64 -> i32
      %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
      %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
      %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
      %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz = vm.cmp.nz.i64 %8#1 : i64
      %9 = vm.select.i32 %8#0, %nz, %zero : i32
      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
    ^bb3: // pred: ^bb2
      %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
      %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
      vm.br ^bb4(%11 : i32)
    ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
      %14 = vm.add.i64 %3, %13 : i64
      %15 = vm.and.i32 %12, %eq : i32
      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
      %16 = vm.add.i64 %2, %c1_1 : i64
      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
    ^bb5: // pred: ^bb1
      vm.cond_br %5, ^bb6, ^bb7
    ^bb6: // pred: ^bb5
      vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
    ^bb7: // pred: ^bb5
      %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
      %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
      %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
      %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
      %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
      vm.cond_br %eq_10, ^bb8, ^bb9
    ^bb8: // pred: ^bb7
      %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
      %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
      vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
      vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
      vm.return
    ^bb9: // pred: ^bb7
      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
    }
    vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
    vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
    vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
    vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
    vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
    vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
    vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
    vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
    vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
    vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
    vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
    vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
    vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
    vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
    vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
    vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
    vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
    vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
    vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
    vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
    vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
    vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
    vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
    vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
    vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
    vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
    vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
    vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
    vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
    vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
    vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
    vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
    vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
    vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
    vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
    vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
    vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
    vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
    vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
    vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
    vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
    vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
    vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
    vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
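    // @main below is byte-identical to the copy of @main in the previous dump.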
    vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
      %c13 = vm.const.i32 13
      %c28 = vm.const.i32 28
      %c4 = vm.const.i32 4
      %c7 = vm.const.i32 7
      %c32 = vm.const.i32 32
      %c48 = vm.const.i32 48
      %c3075 = vm.const.i32 3075
      %c16 = vm.const.i32 16
      %c553648160 = vm.const.i32 553648160
      %c3 = vm.const.i32 3
      %c2 = vm.const.i32 2
      %c1 = vm.const.i32 1
      %zero = vm.const.i32.zero
      %c9 = vm.const.i64 9
      %c-1 = vm.const.i32 -1
      %c7_0 = vm.const.i64 7
      %c2_1 = vm.const.i64 2
      %zero_2 = vm.const.i64.zero
      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
      %c-1_3 = vm.const.i64 -1
      %c2016 = vm.const.i64 2016
      %c2400 = vm.const.i64 2400
      %c5 = vm.const.i64 5
      %c4_4 = vm.const.i64 4
      %c6 = vm.const.i64 6
      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
      %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
      %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
      %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
      %4 = vm.mul.i64 %0, %c4_4 : i64
      %5 = vm.mul.i64 %4, %1 : i64
      %6 = vm.mul.i64 %5, %2 : i64
      %7 = vm.mul.i64 %6, %3 : i64
      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
      %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
      %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
      %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
      %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
      %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
      %8 = vm.trunc.i64.i32 %0 : i64 -> i32
      %9 = vm.shr.i64.u %0, %c32 : i64
      %10 = vm.trunc.i64.i32 %9 : i64 -> i32
      %11 = vm.trunc.i64.i32 %1 : i64 -> i32
      %12 = vm.shr.i64.u %1, %c32 : i64
      %13 = vm.trunc.i64.i32 %12 : i64 -> i32
      %14 = vm.trunc.i64.i32 %2 : i64 -> i32
      %15 = vm.shr.i64.u %2, %c32 : i64
      %16 = vm.trunc.i64.i32 %15 : i64 -> i32
      %17 = vm.trunc.i64.i32 %3 : i64 -> i32
      %18 = vm.shr.i64.u %3, %c32 : i64
      %19 = vm.trunc.i64.i32 %18 : i64 -> i32
      %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
      vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
      vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
      vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
      vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
      vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
      %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
      vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
      %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
      vm.cond_br %20, ^bb2, ^bb1
    ^bb1: // pred: ^bb0
      %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
      vm.return %ref_11 : !vm.ref<!hal.buffer_view>
    ^bb2: // pred: ^bb0
      vm.fail %20, "failed to wait on timepoint"
    }
    vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  }
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (module unchanged by the pass; its body is byte-identical to the "Before FoldGlobals" dump above)
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
// (module identical to the preceding dump; the log is truncated partway through this dump)
8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000
00000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 
%8#1 : i64
  %9 = vm.select.i32 %8#0, %nz, %zero : i32
  vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
^bb3: // pred: ^bb2
  %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_3 = vm.cmp.nz.i64 %10#1 : i64
  %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
  vm.br ^bb4(%11 : i32)
^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
  %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
  %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
  %14 = vm.add.i64 %3, %13 : i64
  %15 = vm.and.i32 %12, %eq : i32
  %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
  %16 = vm.add.i64 %2, %c1_1 : i64
  vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
^bb5: // pred: ^bb1
  vm.cond_br %5, ^bb6, ^bb7
^bb6: // pred: ^bb5
  vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
^bb7: // pred: ^bb5
  %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_7 = vm.cmp.nz.i64 %17#1 : i64
  %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
  %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
  %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
  %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
  %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
  vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
  vm.cond_br %eq_10, ^bb8, ^bb9
^bb8: // pred: ^bb7
  %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
  %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...)
-> !vm.ref<!hal.executable>
  vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.return
^bb9: // pred: ^bb7
  vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
}
vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...)
-> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...)
-> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c13 = vm.const.i32 13
  %c28 = vm.const.i32 28
  %c4 = vm.const.i32 4
  %c7 = vm.const.i32 7
  %c32 = vm.const.i32 32
  %c48 = vm.const.i32 48
  %c3075 = vm.const.i32 3075
  %c16 = vm.const.i32 16
  %c553648160 = vm.const.i32 553648160
  %c3 = vm.const.i32 3
  %c2 = vm.const.i32 2
  %c1 = vm.const.i32 1
  %zero = vm.const.i32.zero
  %c9 = vm.const.i64 9
  %c-1 = vm.const.i32 -1
  %c7_0 = vm.const.i64 7
  %c2_1 = vm.const.i64 2
  %zero_2 = vm.const.i64.zero
  %null = vm.const.ref.zero : !vm.ref<!hal.fence>
  %c-1_3 = vm.const.i64 -1
  %c2016 = vm.const.i64 2016
  %c2400 = vm.const.i64 2400
  %c5 = vm.const.i64 5
  %c4_4 = vm.const.i64 4
  %c6 = vm.const.i64 6
  %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
  %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
  %4 = vm.mul.i64 %0, %c4_4 : i64
  %5 = vm.mul.i64 %4, %1 : i64
  %6 = vm.mul.i64 %5, %2 : i64
  %7 = vm.mul.i64 %6, %3 : i64
  %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
  %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
  %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
  vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
  %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
  %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
  vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
  %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
  %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
  %8 = vm.trunc.i64.i32 %0 : i64 -> i32
  %9 = vm.shr.i64.u %0, %c32 : i64
  %10 = vm.trunc.i64.i32 %9 : i64 -> i32
  %11 = vm.trunc.i64.i32 %1 : i64 -> i32
  %12 = vm.shr.i64.u %1, %c32 : i64
  %13 = vm.trunc.i64.i32 %12 : i64 -> i32
  %14 = vm.trunc.i64.i32 %2 : i64 -> i32
  %15 = vm.shr.i64.u %2, %c32 : i64
  %16 = vm.trunc.i64.i32 %15 : i64 -> i32
  %17 = vm.trunc.i64.i32 %3 : i64 -> i32
  %18 = vm.shr.i64.u %3, %c32 : i64
  %19 = vm.trunc.i64.i32 %18 : i64 -> i32
  %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
  vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
  vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
  vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
  vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
  %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
  vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
  %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
  vm.cond_br %20, ^bb2, ^bb1
^bb1: // pred: ^bb0
  %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8
F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000
0000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 
%8#1 : i64
  %9 = vm.select.i32 %8#0, %nz, %zero : i32
  vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
^bb3: // pred: ^bb2
  %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_3 = vm.cmp.nz.i64 %10#1 : i64
  %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
  vm.br ^bb4(%11 : i32)
^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
  %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
  %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
  %14 = vm.add.i64 %3, %13 : i64
  %15 = vm.and.i32 %12, %eq : i32
  %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
  %16 = vm.add.i64 %2, %c1_1 : i64
  vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
^bb5: // pred: ^bb1
  vm.cond_br %5, ^bb6, ^bb7
^bb6: // pred: ^bb5
  vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
^bb7: // pred: ^bb5
  %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_7 = vm.cmp.nz.i64 %17#1 : i64
  %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
  %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
  %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
  %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
  %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
  vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
  vm.cond_br %eq_10, ^bb8, ^bb9
^bb8: // pred: ^bb7
  %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
  %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...)
-> !vm.ref<!hal.executable>
  vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.return
^bb9: // pred: ^bb7
  vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
}
vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...)
-> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...)
-> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c13 = vm.const.i32 13
  %c28 = vm.const.i32 28
  %c4 = vm.const.i32 4
  %c7 = vm.const.i32 7
  %c32 = vm.const.i32 32
  %c48 = vm.const.i32 48
  %c3075 = vm.const.i32 3075
  %c16 = vm.const.i32 16
  %c553648160 = vm.const.i32 553648160
  %c3 = vm.const.i32 3
  %c2 = vm.const.i32 2
  %c1 = vm.const.i32 1
  %zero = vm.const.i32.zero
  %c9 = vm.const.i64 9
  %c-1 = vm.const.i32 -1
  %c7_0 = vm.const.i64 7
  %c2_1 = vm.const.i64 2
  %zero_2 = vm.const.i64.zero
  %null = vm.const.ref.zero : !vm.ref<!hal.fence>
  %c-1_3 = vm.const.i64 -1
  %c2016 = vm.const.i64 2016
  %c2400 = vm.const.i64 2400
  %c5 = vm.const.i64 5
  %c4_4 = vm.const.i64 4
  %c6 = vm.const.i64 6
  %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
  %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
  %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
  %4 = vm.mul.i64 %0, %c4_4 : i64
  %5 = vm.mul.i64 %4, %1 : i64
  %6 = vm.mul.i64 %5, %2 : i64
  %7 = vm.mul.i64 %6, %3 : i64
  %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
  %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
  %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
  vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
  %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
  %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
  vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
  %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
  %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
  %8 = vm.trunc.i64.i32 %0 : i64 -> i32
  %9 = vm.shr.i64.u %0, %c32 : i64
  %10 = vm.trunc.i64.i32 %9 : i64 -> i32
  %11 = vm.trunc.i64.i32 %1 : i64 -> i32
  %12 = vm.shr.i64.u %1, %c32 : i64
  %13 = vm.trunc.i64.i32 %12 : i64 -> i32
  %14 = vm.trunc.i64.i32 %2 : i64 -> i32
  %15 = vm.shr.i64.u %2, %c32 : i64
  %16 = vm.trunc.i64.i32 %15 : i64 -> i32
  %17 = vm.trunc.i64.i32 %3 : i64 -> i32
  %18 = vm.shr.i64.u %3, %c32 : i64
  %19 = vm.trunc.i64.i32 %18 : i64 -> i32
  %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
  vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
  vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
  vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
  vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
  %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
  vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
  %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
  vm.cond_br %20, ^bb2, ^bb1
^bb1: // pred: ^bb0
  %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before mlir::iree_compiler::IREE::VM::ResolveRodataLoadsPass (iree-vm-resolve-rodata-loads) //----- // vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB
84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003
0000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = 
vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
  vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } // -----// IR Dump After mlir::iree_compiler::IREE::VM::ResolveRodataLoadsPass (iree-vm-resolve-rodata-loads) //----- // vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB845
31C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000
0000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = 
vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
  vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } // -----// IR Dump Before Inliner (inline) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989C
A4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000000000010020000000000
0010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = 
vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3:  // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
    // Keep the first device that answered both queries; %14 counts matches.
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5:  // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6:  // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7:  // pred: ^bb5
    // Device found: create the layouts, publish @__device_0, load the executable.
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8:  // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...)
-> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9:  // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
  vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...)
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
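    // What follows is the synchronous dispatch sequence: resolve the two input
    // buffers, allocate the 2x4x7x9xf32 result (2016 bytes) from the device
    // queue, push the dynamic input shape as eight i32 push constants (lo/hi
    // halves of the four i64 dims), bind the three buffers, dispatch
    // main_dispatch_0 over a 7x4x1 workgroup grid, and block on the signal
    // fence before wrapping the result buffer in a buffer view.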
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = 
vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } // -----// IR Dump After Canonicalizer (canonicalize) //----- // vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // 
pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } // -----// IR Dump After Canonicalizer (canonicalize) //----- // vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
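    // (Canonicalization is a no-op on @main at this point; this "After" dump
    // repeats the "Before" body above verbatim.)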
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } // -----// IR Dump After Inliner (inline) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC
6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000
000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata 
@_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5:  // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6:  // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7:  // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8:  // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...)
-> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9:  // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32)
  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32)
  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64)
  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64)
  vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {minimum_version = 4 : i32}
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>)
  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i32)
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.executable.create2(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {minimum_version = 4 : i32, nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.join(%fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...)
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
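// NOTE: the ABI prologue above validates the call arguments: %0..%3 are the four dynamic dims of
// %input0, and %c553648160 is 0x21000020, IREE's element-type encoding for f32. The asserted byte
// lengths work out as %7 = d0*d1*d2*d3*4 for input0, %c2400 = 4*6*5*5*4 for the input1 weights, and
// %c2016 = 2*4*7*9*4 for the conv result (a 5x5 kernel at stride 1 gives 11-5+1 = 7 by 13-5+1 = 9
// output pixels). Below, each i64 dim is split into i32 halves for the push constants:
// lo = vm.trunc.i64.i32 %d and hi = vm.trunc.i64.i32 (vm.shr.i64.u %d, %c32), i.e. lo = d mod 2^32, hi = d / 2^32.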
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before SymbolDCE (symbol-dce) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C46..."> : vector<4136xi8>
// [4136-byte embedded ELF blob elided: byte-identical to @main_dispatch_0_embedded_elf_x86_64 in the preceding dump]
vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9
= vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) 
-> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) 
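// NOTE: this "Before SymbolDCE" module matches the preceding dump; the full HAL import list is
// still carried along even though the initializer and @main reference only a handful of methods.
// SymbolDCE prunes the unreferenced vm.import symbols, which is why the "After SymbolDCE" dump
// below declares a much shorter import list.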
vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i32, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i32, %pattern_length : i32) vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32) vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64) vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64) vm.import private @hal.command_buffer.dispatch2(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.command_buffer.dispatch2.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) attributes {minimum_version = 4 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref) vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i32) vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %binding_table : tuple, i64, i64> ...) vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create2(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {minimum_version = 4 : i32, nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.join(%fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 vm.import private @hal.fence.signal(%fence : !vm.ref) vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
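// NOTE: the sequence below records a one-shot command buffer: push_constants passes the eight i32
// dim halves, push_descriptor_set binds input0 (%ref), input1 (%ref_6), and the queue-allocated
// result buffer (%ref_8) at bindings 0..2, and the dispatch launches main_dispatch_0 on a static
// 7x4x1 workgroup grid (presumably fixed at compile time from tiling the 2x4x7x9 result).
// Execution is fenced: queue.execute waits on the alloca fence %ref_7 and signals %ref_10.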
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After SymbolDCE (symbol-dce) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C46..."> : vector<4136xi8>
// [4136-byte embedded ELF blob elided: byte-identical to @main_dispatch_0_embedded_elf_x86_64 in the earlier dumps]
vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9
= vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
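// NOTE: @main itself is unchanged by SymbolDCE; only unused module-level symbols were dropped. On
// the happy path hal.fence.await returns status 0, so vm.cond_br %20 falls through to ^bb1, which
// wraps the 2016-byte allocation in a 2x4x7x9 f32 buffer view and returns it; any nonzero status
// takes ^bb2 and fails with "failed to wait on timepoint".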
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F003000
04C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000
100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : 
i64
  %9 = vm.select.i32 %8#0, %nz, %zero : i32
  vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
^bb3: // pred: ^bb2
  %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_3 = vm.cmp.nz.i64 %10#1 : i64
  %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
  vm.br ^bb4(%11 : i32)
^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
  %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
  %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
  %14 = vm.add.i64 %3, %13 : i64
  %15 = vm.and.i32 %12, %eq : i32
  %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref
  %16 = vm.add.i64 %2, %c1_1 : i64
  vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref)
^bb5: // pred: ^bb1
  vm.cond_br %5, ^bb6, ^bb7
^bb6: // pred: ^bb5
  vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
^bb7: // pred: ^bb5
  %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_7 = vm.cmp.nz.i64 %17#1 : i64
  %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
  %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref
  %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref
  %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
  %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
  vm.global.store.ref %4, @__device_0 : !vm.ref
  vm.cond_br %eq_10, ^bb8, ^bb9
^bb8: // pred: ^bb7
  %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
  %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref
  vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref
  vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref
  vm.return
^bb9: // pred: ^bb7
  vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
}
vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
-> !vm.ref attributes {nosideeffects}
vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects}
vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32}
vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref)
vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...)
vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...)
vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref
vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...)
vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref
vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield}
vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...)
-> !vm.ref attributes {nosideeffects}
vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c13 = vm.const.i32 13
  %c28 = vm.const.i32 28
  %c4 = vm.const.i32 4
  %c7 = vm.const.i32 7
  %c32 = vm.const.i32 32
  %c48 = vm.const.i32 48
  %c3075 = vm.const.i32 3075
  %c16 = vm.const.i32 16
  %c553648160 = vm.const.i32 553648160
  %c3 = vm.const.i32 3
  %c2 = vm.const.i32 2
  %c1 = vm.const.i32 1
  %zero = vm.const.i32.zero
  %c9 = vm.const.i64 9
  %c-1 = vm.const.i32 -1
  %c7_0 = vm.const.i64 7
  %c2_1 = vm.const.i64 2
  %zero_2 = vm.const.i64.zero
  %null = vm.const.ref.zero : !vm.ref
  %c-1_3 = vm.const.i64 -1
  %c2016 = vm.const.i64 2016
  %c2400 = vm.const.i64 2400
  %c5 = vm.const.i64 5
  %c4_4 = vm.const.i64 4
  %c6 = vm.const.i64 6
  %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref
  %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref
  %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref
  %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64
  %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64
  %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64
  %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64
  %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...)
  %4 = vm.mul.i64 %0, %c4_4 : i64
  %5 = vm.mul.i64 %4, %1 : i64
  %6 = vm.mul.i64 %5, %2 : i64
  %7 = vm.mul.i64 %6, %3 : i64
  %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref
  %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref
  %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
  vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> ()
  %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...)
  %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref
  vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> ()
  %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref
  %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref
  %8 = vm.trunc.i64.i32 %0 : i64 -> i32
  %9 = vm.shr.i64.u %0, %c32 : i64
  %10 = vm.trunc.i64.i32 %9 : i64 -> i32
  %11 = vm.trunc.i64.i32 %1 : i64 -> i32
  %12 = vm.shr.i64.u %1, %c32 : i64
  %13 = vm.trunc.i64.i32 %12 : i64 -> i32
  %14 = vm.trunc.i64.i32 %2 : i64 -> i32
  %15 = vm.shr.i64.u %2, %c32 : i64
  %16 = vm.trunc.i64.i32 %15 : i64 -> i32
  %17 = vm.trunc.i64.i32 %3 : i64 -> i32
  %18 = vm.shr.i64.u %3, %c32 : i64
  %19 = vm.trunc.i64.i32 %18 : i64 -> i32
  %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref
  vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...)
  vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...)
  vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> ()
  vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> ()
  vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> ()
  %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref
  vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...)
  %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32
  vm.cond_br %20, ^bb2, ^bb1
^bb1: // pred: ^bb0
  %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After Canonicalizer (canonicalize) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F0030000
4C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001
00200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : 
i64
  %9 = vm.select.i32 %8#0, %nz, %zero : i32
  vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
^bb3: // pred: ^bb2
  %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_3 = vm.cmp.nz.i64 %10#1 : i64
  %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
  vm.br ^bb4(%11 : i32)
^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
  %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
  %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
  %14 = vm.add.i64 %3, %13 : i64
  %15 = vm.and.i32 %12, %eq : i32
  %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref
  %16 = vm.add.i64 %2, %c1_1 : i64
  vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref)
^bb5: // pred: ^bb1
  vm.cond_br %5, ^bb6, ^bb7
^bb6: // pred: ^bb5
  vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
^bb7: // pred: ^bb5
  %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
  %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
  %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64)
  %nz_7 = vm.cmp.nz.i64 %17#1 : i64
  %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
  %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref
  %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref
  %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
  %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
  vm.global.store.ref %4, @__device_0 : !vm.ref
  vm.cond_br %eq_10, ^bb8, ^bb9
^bb8: // pred: ^bb7
  %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
  %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref
  vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref
  vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref
  vm.return
^bb9: // pred: ^bb7
  vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
}
vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
-> !vm.ref attributes {nosideeffects}
vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects}
vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32}
vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref)
vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...)
vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...)
vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref
vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...)
vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects}
vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref
vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield}
vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...)
-> !vm.ref attributes {nosideeffects}
vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
  %c13 = vm.const.i32 13
  %c28 = vm.const.i32 28
  %c4 = vm.const.i32 4
  %c7 = vm.const.i32 7
  %c32 = vm.const.i32 32
  %c48 = vm.const.i32 48
  %c3075 = vm.const.i32 3075
  %c16 = vm.const.i32 16
  %c553648160 = vm.const.i32 553648160
  %c3 = vm.const.i32 3
  %c2 = vm.const.i32 2
  %c1 = vm.const.i32 1
  %zero = vm.const.i32.zero
  %c9 = vm.const.i64 9
  %c-1 = vm.const.i32 -1
  %c7_0 = vm.const.i64 7
  %c2_1 = vm.const.i64 2
  %zero_2 = vm.const.i64.zero
  %null = vm.const.ref.zero : !vm.ref
  %c-1_3 = vm.const.i64 -1
  %c2016 = vm.const.i64 2016
  %c2400 = vm.const.i64 2400
  %c5 = vm.const.i64 5
  %c4_4 = vm.const.i64 4
  %c6 = vm.const.i64 6
  %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref
  %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref
  %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref
  %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64
  %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64
  %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64
  %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64
  %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...)
  %4 = vm.mul.i64 %0, %c4_4 : i64
  %5 = vm.mul.i64 %4, %1 : i64
  %6 = vm.mul.i64 %5, %2 : i64
  %7 = vm.mul.i64 %6, %3 : i64
  %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref
  %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref
  %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
  vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> ()
  %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
  vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...)
  %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref
  vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> ()
  %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref
  %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref
  %8 = vm.trunc.i64.i32 %0 : i64 -> i32
  %9 = vm.shr.i64.u %0, %c32 : i64
  %10 = vm.trunc.i64.i32 %9 : i64 -> i32
  %11 = vm.trunc.i64.i32 %1 : i64 -> i32
  %12 = vm.shr.i64.u %1, %c32 : i64
  %13 = vm.trunc.i64.i32 %12 : i64 -> i32
  %14 = vm.trunc.i64.i32 %2 : i64 -> i32
  %15 = vm.shr.i64.u %2, %c32 : i64
  %16 = vm.trunc.i64.i32 %15 : i64 -> i32
  %17 = vm.trunc.i64.i32 %3 : i64 -> i32
  %18 = vm.shr.i64.u %3, %c32 : i64
  %19 = vm.trunc.i64.i32 %18 : i64 -> i32
  %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref
  vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...)
  vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...)
  vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> ()
  vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> ()
  vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> ()
  %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref
  vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...)
  %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32
  vm.cond_br %20, ^bb2, ^bb1
^bb1: // pred: ^bb0
  %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before CSE (cse) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531
DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100
20000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = 
vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D
69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000
00000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = 
vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
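// The two hal.buffer_view.assert calls above enforce the reflection ABI:
// element type 553648160 is 0x21000020 (f32 in the HAL element-type
// encoding), %input0 keeps the four dims queried from it via
// hal.buffer_view.dim, and %input1 must be exactly 4x6x5x5. The calls that
// follow fetch the backing buffers, assert minimum byte lengths
// (2400 = 4*6*5*5 * 4 bytes for %input1), and allocate a 2016-byte transient
// result buffer (2*4*7*9 * 4 bytes for the tensor<2x4x7x9xf32> output) on
// the device queue behind a fence.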
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C
8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000
00000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 
%8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
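// Further down in this function, each dynamic i64 dimension of %input0 is
// split into 32-bit halves (vm.trunc.i64.i32 for the low word, vm.shr.i64.u
// by 32 followed by a trunc for the high word) so that the four dims travel
// to the executable as eight i32 push constants via
// hal.command_buffer.push_constants.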
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8
F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000
0000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 
%8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C
8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000
00000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 
%8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) 
-> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } } // -----// IR Dump Before mlir::iree_compiler::IREE::VM::GlobalInitializationPass (iree-vm-global-initialization) //----- // vm.module public @module { vm.global.ref private @__device_0 : !vm.ref vm.global.ref private @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D
3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000
0030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.initializer { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) 
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref immutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...)
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} } // -----// IR Dump After mlir::iree_compiler::IREE::VM::GlobalInitializationPass (iree-vm-global-initialization) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC
248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B0000003400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004
000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000180800000000000037000000000000000000000000000000010000000000000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
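  // [annotation, not compiler output] The vm.import declarations around this point
  // are the module's call interface into the runtime-provided HAL module and are
  // resolved against it when the VM context is created. A trailing "..." marks a
  // variadic segment of the signature, and {nosideeffects} marks calls the VM may
  // reorder or drop when their results are unused.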
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.br ^bb10
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  ^bb10: // pred: ^bb8
    vm.return
  }
  vm.export @__deinit
  vm.func private @__deinit() {
    vm.return
  }
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module attributes {vm.toplevel} {
  vm.module public @module {
    vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
    vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
    vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
    vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
    vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
    vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
    -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
    vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
    vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
    vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
    vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
    vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
    vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
    vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
      %c13 = vm.const.i32 13
      %c28 = vm.const.i32 28
      %c4 = vm.const.i32 4
      %c7 = vm.const.i32 7
      %c32 = vm.const.i32 32
      %c48 = vm.const.i32 48
      %c3075 = vm.const.i32 3075
      %c16 = vm.const.i32 16
      %c553648160 = vm.const.i32 553648160
      %c3 = vm.const.i32 3
      %c2 = vm.const.i32 2
      %c1 = vm.const.i32 1
      %zero = vm.const.i32.zero
      %c9 = vm.const.i64 9
      %c-1 = vm.const.i32 -1
      %c7_0 = vm.const.i64 7
      %c2_1 = vm.const.i64 2
      %zero_2 = vm.const.i64.zero
      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
      %c-1_3 = vm.const.i64 -1
      %c2016 = vm.const.i64 2016
      %c2400 = vm.const.i64 2400
      %c5 = vm.const.i64 5
      %c4_4 = vm.const.i64 4
      %c6 = vm.const.i64 6
      %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
      %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
      %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
      %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
      %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
      %4 = vm.mul.i64 %0, %c4_4 : i64
      %5 = vm.mul.i64 %4, %1 : i64
      %6 = vm.mul.i64 %5, %2 : i64
      %7 = vm.mul.i64 %6, %3 : i64
      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
      %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
      %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
      %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
      %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
      %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
      %8 = vm.trunc.i64.i32 %0 : i64 -> i32
      %9 = vm.shr.i64.u %0, %c32 : i64
      %10 = vm.trunc.i64.i32 %9 : i64 -> i32
      %11 = vm.trunc.i64.i32 %1 : i64 -> i32
      %12 = vm.shr.i64.u %1, %c32 : i64
      %13 = vm.trunc.i64.i32 %12 : i64 -> i32
      %14 = vm.trunc.i64.i32 %2 : i64 -> i32
      %15 = vm.shr.i64.u %2, %c32 : i64
      %16 = vm.trunc.i64.i32 %15 : i64 -> i32
      %17 = vm.trunc.i64.i32 %3 : i64 -> i32
      %18 = vm.shr.i64.u %3, %c32 : i64
      %19 = vm.trunc.i64.i32 %18 : i64 -> i32
      %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
      vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
      vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
      vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
      vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
      vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
      %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
      vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
      %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
      vm.cond_br %20, ^bb2, ^bb1
    ^bb1: // pred: ^bb0
      %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
      vm.return %ref_11 : !vm.ref<!hal.buffer_view>
    ^bb2: // pred: ^bb0
      vm.fail %20, "failed to wait on timepoint"
    }
    vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
    vm.export @__init
    vm.func private @__init() {
      %null = vm.const.ref.zero : !vm.buffer
      %c8 = vm.const.i32 8
      %c2 = vm.const.i32 2
      %c3 = vm.const.i32 3
      %c7 = vm.const.i32 7
      %c1 = vm.const.i32 1
      %c14 = vm.const.i32 14
      %c-1 = vm.const.i64 -1
      %c5 = vm.const.i32 5
      %zero = vm.const.i32.zero
      %zero_0 = vm.const.i64.zero
      %c1_1 = vm.const.i64 1
      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
      vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
      %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
      %5 = vm.xor.i32 %rnz, %c1 : i32
      %slt = vm.cmp.lt.i64.s %2, %1 : i64
      %6 = vm.and.i32 %5, %slt : i32
      vm.cond_br %6, ^bb2, ^bb5
    ^bb2: // pred: ^bb1
      %7 = vm.trunc.i64.i32 %2 : i64 -> i32
      %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
      %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
      %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
      %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz = vm.cmp.nz.i64 %8#1 : i64
      %9 = vm.select.i32 %8#0, %nz, %zero : i32
      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
    ^bb3: // pred: ^bb2
      %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
      %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
      vm.br ^bb4(%11 : i32)
    ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
      %14 = vm.add.i64 %3, %13 : i64
      %15 = vm.and.i32 %12, %eq : i32
      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
      %16 = vm.add.i64 %2, %c1_1 : i64
      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
    ^bb5: // pred: ^bb1
      vm.cond_br %5, ^bb6, ^bb7
    ^bb6: // pred: ^bb5
      vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
    ^bb7: // pred: ^bb5
      %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
      %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
      %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
      %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
      %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
      vm.cond_br %eq_10, ^bb8, ^bb9
    ^bb8: // pred: ^bb7
      %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
      %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
      vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
      vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
      vm.br ^bb10
    ^bb9: // pred: ^bb7
      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
    ^bb10: // pred: ^bb8
      vm.return
    }
    vm.export @__deinit
    vm.func private @__deinit() {
      vm.return
    }
  }
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module attributes {vm.toplevel} {
  vm.module public @module {
    vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
    vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
    vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
    vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
    vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
    vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.export @__deinit vm.func private @__deinit() { vm.return } } }
// -----// IR Dump After CSE (cse) //----- //
module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.export @__deinit vm.func private @__deinit() { vm.return } } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // module attributes {vm.toplevel} { vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
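// note (annotation, not compiler output): the ops that follow compute the dynamic input's byte length as
// d0 * 4 * d1 * d2 * d3 (4 bytes per f32 element) and assert the backing buffers: input1 must span
// 2400 bytes (4*6*5*5*4) and the transient result allocation is 2016 bytes (2*4*7*9*4). Each queried
// i64 dim is then split into lo/hi i32 halves (vm.trunc.i64.i32 plus vm.shr.i64.u by 32) so the four
// dims travel as eight 32-bit push constants, and the dispatch is recorded with workgroup counts
// (7, 4, 1) before the command buffer is finalized, queued, and awaited on a fence.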
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } vm.export @__deinit vm.func private @__deinit() { vm.return } } }
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (module body identical to the "IR Dump Before Canonicalizer" listing above; verbatim duplicate elided)
// -----// IR Dump Before mlir::iree_compiler::IREE::VM::DropEmptyModuleInitializersPass (iree-vm-drop-empty-module-initializers) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
  vm.export @__deinit
  vm.func private @__deinit() {
    vm.return
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::VM::DropEmptyModuleInitializersPass (iree-vm-drop-empty-module-initializers) //----- //
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
}
// -----// IR Dump Before DropCompilerHints (iree-util-drop-compiler-hints) //----- //
module attributes {vm.toplevel} {
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
}
}
// -----// IR Dump After DropCompilerHints (iree-util-drop-compiler-hints) //----- //
module attributes {vm.toplevel} {
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
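// %0..%3 above are the dynamic dims of %input0 (queried via hal.buffer_view.dim); the vm.mul.i64 chain below computes the expected byte length %0 * 4 * %1 * %2 * %3 (4 bytes per f32 element) for hal.buffer.assert, and each i64 dim is then split into low/high i32 halves (vm.trunc.i64.i32 plus vm.shr.i64.u by 32) because hal.command_buffer.push_constants only accepts i32 values.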
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } } } // -----// IR Dump Before mlir::iree_compiler::IREE::VM::GlobalInitializationPass (iree-vm-global-initialization) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } } // -----// IR Dump After mlir::iree_compiler::IREE::VM::GlobalInitializationPass (iree-vm-global-initialization) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
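// in the block below, the 2016-byte transient result (2*4*7*9 f32 elements, 4 bytes each) is allocated with hal.device.queue.alloca signaling fence %ref_7; the dispatch is recorded with a 7x4x1 workgroup count and submitted via hal.device.queue.execute (waiting on %ref_7, signaling %ref_10), and once hal.fence.await on %ref_10 returns zero the raw buffer is wrapped into the returned 2x4x7x9xf32 buffer view; a nonzero wait status branches to vm.fail "failed to wait on timepoint".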
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.br ^bb10
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  ^bb10: // pred: ^bb8
    vm.return
  }
  vm.export @__deinit
  vm.func private @__deinit() {
    vm.return
  }
}
// -----// IR Dump Before mlir::iree_compiler::IREE::VM::DropEmptyModuleInitializersPass (iree-vm-drop-empty-module-initializers) //----- //
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.br ^bb10
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  ^bb10: // pred: ^bb8
    vm.return
  }
  vm.export @__deinit
  vm.func private @__deinit() {
    vm.return
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::VM::DropEmptyModuleInitializersPass (iree-vm-drop-empty-module-initializers) //----- //
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.br ^bb10
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  ^bb10: // pred: ^bb8
    vm.return
  }
}
// -----// IR Dump Before Inliner (inline) //----- //
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
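// Note on the assert above: the expected element type is passed as the i32
// constant %c553648160. Decoding it under IREE's usual HAL element-type
// packing (numerical class in the upper bits, bit width in the lower bits) is
// an assumption here, but it lines up with the declared ABI:
//   553648160 = 0x21000020  =>  numerical type 0x21 (IEEE float), 32 bits  =>  f32
// i.e. %input0 is checked to be a rank-4 f32 buffer view with dims [%0, %1, %2, %3].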
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.br ^bb10 ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" ^bb10: // pred: ^bb8 vm.return } } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
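// The multiply chain that follows (%4..%7) derives the input's byte length
// from its dynamic dims: %7 = 4 * d0 * d1 * d2 * d3, the %c4_4 factor being
// sizeof(f32). Worked through for the 2x6x11x13 input this trace was captured
// with: 2*6*11*13 = 1716 elements, 1716*4 = 6864 bytes, which is the minimum
// length the hal.buffer.assert below then enforces.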
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } // -----// IR Dump After Canonicalizer (canonicalize) //----- // vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
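// The fixed sizes used from here on are plain byte counts derivable from the
// static shapes: the 4x6x5x5 f32 weights need 600 elements * 4 = 2400 bytes
// (%c2400), and the transient result reserved by hal.device.queue.alloca
// holds the 2x4x7x9 f32 output, 504 elements * 4 = 2016 bytes (%c2016).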
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } // -----// IR Dump Before Canonicalizer (canonicalize) //----- // vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), 
(%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.br ^bb10 ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" ^bb10: // pred: ^bb8 vm.return } // -----// IR Dump After Canonicalizer (canonicalize) //----- // vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: 
#hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } // -----// IR Dump After Inliner (inline) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
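// Before recording the dispatch, each i64 dim is split into two i32 words,
// since push constants are i32-sized. A minimal sketch of the packing (the
// name d stands in for any of %0..%3 and is illustrative, not from the IR):
//   lo = vm.trunc.i64.i32 d                        // low 32 bits
//   hi = vm.trunc.i64.i32 (vm.shr.i64.u d, 32)     // high 32 bits
// Four dims thus yield the 8 values handed to hal.command_buffer.push_constants,
// and the dispatch is recorded with a 7x4x1 workgroup grid over the 2x4x7x9 output.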
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
}
// -----// IR Dump Before CSE (cse) //----- //
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %4 = vm.mul.i64 %0, %c4_4 : i64
    %5 = vm.mul.i64 %4, %1 : i64
    %6 = vm.mul.i64 %5, %2 : i64
    %7 = vm.mul.i64 %6, %3 : i64
    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer
    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
    %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i32, i32, i32, i64) -> !vm.ref<!hal.buffer>
    %8 = vm.trunc.i64.i32 %0 : i64 -> i32
    %9 = vm.shr.i64.u %0, %c32 : i64
    %10 = vm.trunc.i64.i32 %9 : i64 -> i32
    %11 = vm.trunc.i64.i32 %1 : i64 -> i32
    %12 = vm.shr.i64.u %1, %c32 : i64
    %13 = vm.trunc.i64.i32 %12 : i64 -> i32
    %14 = vm.trunc.i64.i32 %2 : i64 -> i32
    %15 = vm.shr.i64.u %2, %c32 : i64
    %16 = vm.trunc.i64.i32 %15 : i64 -> i32
    %17 = vm.trunc.i64.i32 %3 : i64 -> i32
    %18 = vm.shr.i64.u %3, %c32 : i64
    %19 = vm.trunc.i64.i32 %18 : i64 -> i32
    %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
    vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, i32 ...)
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.pipeline_layout>, i32, tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
    vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64) -> ()
    vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref<!hal.command_buffer>) -> ()
    %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.fence>
    vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer> ...)
    %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref<!hal.fence> ...) -> i32
    vm.cond_br %20, ^bb2, ^bb1
  ^bb1: // pred: ^bb0
    %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
    vm.return %ref_11 : !vm.ref<!hal.buffer_view>
  ^bb2: // pred: ^bb0
    vm.fail %20, "failed to wait on timepoint"
  }
  vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}}
  vm.export @__init
  vm.func private @__init() {
    %null = vm.const.ref.zero : !vm.buffer
    %c8 = vm.const.i32 8
    %c2 = vm.const.i32 2
    %c3 = vm.const.i32 3
    %c7 = vm.const.i32 7
    %c1 = vm.const.i32 1
    %c14 = vm.const.i32 14
    %c-1 = vm.const.i64 -1
    %c5 = vm.const.i32 5
    %zero = vm.const.i32.zero
    %zero_0 = vm.const.i64.zero
    %c1_1 = vm.const.i64 1
    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>): // 2 preds: ^bb0, ^bb4
    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
    %5 = vm.xor.i32 %rnz, %c1 : i32
    %slt = vm.cmp.lt.i64.s %2, %1 : i64
    %6 = vm.and.i32 %5, %slt : i32
    vm.cond_br %6, ^bb2, ^bb5
  ^bb2: // pred: ^bb1
    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
    %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer
    %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer
    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz = vm.cmp.nz.i64 %8#1 : i64
    %9 = vm.select.i32 %8#0, %nz, %zero : i32
    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
  ^bb3: // pred: ^bb2
    %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
    vm.br ^bb4(%11 : i32)
  ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3
    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
    %14 = vm.add.i64 %3, %13 : i64
    %15 = vm.and.i32 %12, %eq : i32
    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
    %16 = vm.add.i64 %2, %c1_1 : i64
    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
  ^bb5: // pred: ^bb1
    vm.cond_br %5, ^bb6, ^bb7
  ^bb6: // pred: ^bb5
    vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>"
  ^bb7: // pred: ^bb5
    %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer
    %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer
    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
    %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout>
    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
    %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64
    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
    vm.cond_br %eq_10, ^bb8, ^bb9
  ^bb8: // pred: ^bb7
    %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer
    %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable>
    vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    vm.return
  ^bb9: // pred: ^bb7
    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]"
  }
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
vm.module public @module {
  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
  vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
  vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
  vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id"
  vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*"
  vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format"
  vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64"
  vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8>
  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 3 : i32}
  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32)
  vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %offset : i32, %values : i32 ...)
  vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %pipeline_layout : !vm.ref<!hal.pipeline_layout>, %set : i32, %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64)
  vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %flags : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects}
  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffers : !vm.ref<!hal.command_buffer> ...)
  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref<!hal.pipeline_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects}
  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i32) -> !vm.ref<!hal.fence>
  vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
  vm.import private @hal.pipeline_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.pipeline_layout> attributes {nosideeffects}
  vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0"
  vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor"
  vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1"
  vm.func private @main(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor<?x?x?x?xf32>, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} {
    %c13 = vm.const.i32 13
    %c28 = vm.const.i32 28
    %c4 = vm.const.i32 4
    %c7 = vm.const.i32 7
    %c32 = vm.const.i32 32
    %c48 = vm.const.i32 48
    %c3075 = vm.const.i32 3075
    %c16 = vm.const.i32 16
    %c553648160 = vm.const.i32 553648160
    %c3 = vm.const.i32 3
    %c2 = vm.const.i32 2
    %c1 = vm.const.i32 1
    %zero = vm.const.i32.zero
    %c9 = vm.const.i64 9
    %c-1 = vm.const.i32 -1
    %c7_0 = vm.const.i64 7
    %c2_1 = vm.const.i64 2
    %zero_2 = vm.const.i64.zero
    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
    %c-1_3 = vm.const.i64 -1
    %c2016 = vm.const.i64 2016
    %c2400 = vm.const.i64 2400
    %c5 = vm.const.i64 5
    %c4_4 = vm.const.i64 4
    %c6 = vm.const.i64 6
    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
    %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref<!hal.pipeline_layout>
    %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref<!hal.executable>
    %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref<!hal.buffer_view>, i32) -> i64
    %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer
    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } } // -----// IR Dump After Canonicalizer (canonicalize) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } }
// -----// IR Dump After DropCompilerHints (iree-util-drop-compiler-hints) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"}
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
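// The four hal.buffer_view.dim calls above recover %input0's dynamic dims, and
// hal.buffer_view.assert validates its element type and encoding
// (%c553648160 = 0x21000020, the packed HAL element type for f32; %c1 is the
// dense row-major encoding type).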
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } } // -----// IR Dump Before mlir::iree_compiler::IREE::VM::OrdinalAllocationPass (iree-vm-ordinal-allocation) //----- // vm.module public @module { vm.global.ref private mutable @__device_0 : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
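// Below, the asserted dims are folded into a byte size (%7 = d0*4*d1*d2*d3,
// i.e. 4 bytes per f32 element) and checked against the backing buffer; the
// static operands get the same check (2400 = 4*6*5*5*4 bytes for %input1,
// 2016 = 2*4*7*9*4 bytes for the result allocation); and each i64 dim is then
// split by trunc/shr into lo/hi i32 halves for
// hal.command_buffer.push_constants.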
%4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}} vm.export @__init vm.func private @__init() { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = 
vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } } // -----// IR Dump After mlir::iree_compiler::IREE::VM::OrdinalAllocationPass (iree-vm-ordinal-allocation) //----- // vm.module public @module attributes {ordinal_counts = #vm.ordinal_counts} { vm.global.ref private mutable @__device_0 {ordinal = 0 : i32} : !vm.ref vm.global.ref private mutable @__device_0_pipeline_layout_0 {ordinal = 1 : i32} : !vm.ref vm.global.ref private mutable @__device_0_executable_0_main_dispatch_0 {ordinal = 2 : i32} : !vm.ref vm.rodata private @_utf8_hal_device_id_D0F1B3E9D63E707C {alignment = 1 : i64, ordinal = 0 : i32} "hal.device.id" vm.rodata private @_utf8_local_8DC315A014BAFA34 {alignment = 1 : i64, ordinal = 1 : i32} "local*" vm.rodata private @_utf8_hal_executable_format_1F9665C75F0004D3 {alignment = 1 : i64, ordinal = 2 : i32} "hal.executable.format" vm.rodata private @_utf8_embedded_elf_x86_64_11EF7D6636570B50 {alignment = 1 : i64, ordinal = 3 : i32} "embedded-elf-x86_64" vm.rodata private @main_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf", ordinal = 4 : i32} 
dense<"0x7F454C4602010100000000000000000003003E000100000000000000000000004000000000000000E80A0000000000000000000040003800070040001500130006000000040000004000000000000000400000000000000040000000000000008801000000000000880100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000340400000000000034040000000000000010000000000000010000000500000040040000000000004014000000000000401400000000000041020000000000004102000000000000001000000000000001000000060000009006000000000000902600000000000090260000000000008801000000000000700900000000000000100000000000000200000006000000580700000000000058270000000000005827000000000000C000000000000000C000000000000000080000000000000052E574640400000090060000000000009026000000000000902600000000000088010000000000007009000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120007007016000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000982600000000000008000000000000005803000000000000A82600000000000008000000000000004014000000000000B02600000000000008000000000000006C03000000000000C0260000000000000800000000000000A003000000000000D8260000000000000800000000000000D803000000000000E0260000000000000800000000000000D803000000000000F0260000000000000800000000000000902600000000000010270000000000000800000000000000A82600000000000018270000000000000800000000000000680300000000000020270000000000000800000000000000B02600000000000030270000000000000800000000000000B82600000000000038270000000000000800000000000000D0260000000000006D61696E5F64697370617463685F3000000008036D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002F686F6D652F6D61686573682F697265652F7363726174636873706163652F6973737565732F31383238332F6261642E6D6C6972000000001400000000000000017A5200017810011B0C070890010000280000001C000000481000002D02000000410E108602430D064983078C068D058E048F03031F020C0708000010000000480000004C120000110000000000000000000000000000000000000000000000554889E54157415641554154534C8B4E18488B7620498B4110418B7918458B411C4D89C249C1E2204C8B5E088B0A8B5A044869D3580200004C01DA4881C2F40100004C69DBFC0000004C035E10488D1CC94909FA490FAFCA48C1E10248030E498D349B488975D0490FAF4108490FAFC248C1E00249C1E022498D3CB84531C0904D69C8F00300004C034DD04989CA4531DB6666666666662E0F1F8400000000000F57C04889D34D89D64531FF0F1F40004531E4666666662E0F1F8400000000004F8D2CA6F2430F100CA6F3430F1054A6080FC6D1300FC6CA84498D743D00F2420F10142FF3420F105C2F080FC6DA300FC6D3844C8D2C3EF20F101C37F30F106437080FC6E3300FC6DC84498D743D00F2420F102C2FF3420F10642F080FC6E5300FC6EC844C8D2C3EF20F103437F30F106437080FC6E6300FC6F484F2420F10242FF3420F107C2F080FC6FC300FC6E784F3420F10BCA30CFEFFFFF3460F1084A370FEFFFFF3460F108CA3D4FEFFFFF3460F1094A338FFFFFF0FC6FF000F59F90F58F8450FC6C000440F59C2440F58C7450FC6C900440F59CB450F58C8450FC6D200440F59D5450F58D1F3420F104CA39C0FC6C9000F59CE410F58CAF3420F1004A30FC6C0000F59C40F58C149FFC44983FC050F85E8FEFFFF49FFC74901FE4883C3144983FF050F85C4FEFFFF430F1304990F12C0F3430F11449908498D73034983C20C4983FB064989F30F8290FEFFFF49FFC04801C14983F8020F8560FEFFFF31C05B415C415D415E415F5DC3CCCCCC31C083FF04488D0D74100000480F44C1C3000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000004B00000034000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B0000004700000004000000000008013A0000002C00380000000000000040140000000000002D0200000240140000000000002D020000015604000000040000000101430000000300000000050400696E74006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F663332002D004952454500460000000200000000004B000000260000006D61696E5F64697370617463685F305F636F6E765F32645F6E6368775F666368775F327834783778397836783578355F6633320000000000160000000200000000004B00000043000000696E74000000000055000000040019000000010101FB0E0D000101010100000001000001002D000000000000090240140000000000000105010A03CA00C8054408A5050103794A0603B57F0253010820089E7403CB00C80B02E20212020B00010149524545000000000000000000000000000000000000000000000000000000000000230000000002090058270000000000000000000000000000010000001200070070160000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000058030000000000005803000000000000800000000000000000000000000000000800000000000000000000000000000029000000010000000200000000000000D803000000000000D8030000000000005C00000000000000000000000000000008000000000000000000000000000000330000000100000006000000000000004014000000000000400400000000000041020000000000000000000000000000100000000000000000000000000000003900000001000000030000000000000090260000000000009006000000000000C8000000000000000000000000000000100000000000000000000000000000004600000006000000030000000000000058270000000000005807000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000018280000000000001808000000000000E8070000000000000000000000000000010000000000000000000000000000005E00000001000000000000000000000000000000000000001808000000000000370000000000000000000000000000000100000000000
00000000000000000006C00000001000000000000000000000000000000000000004F080000000000004B000000000000000000000000000000010000000000000000000000000000007800000001000000300000000000000000000000000000009A080000000000003F00000000000000000000000000000001000000000000000100000000000000830000000100000000000000000000000000000000000000D9080000000000004A0000000000000000000000000000000100000000000000000000000000000093000000010000000000000000000000000000000000000023090000000000001A00000000000000000000000000000001000000000000000000000000000000A300000001000000000000000000000000000000000000003D090000000000005900000000000000000000000000000001000000000000000000000000000000AF000000010000003000000000000000000000000000000096090000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000A0090000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000000000000000000000000000000000E809000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000BA0A0000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<4136xi8> vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {ordinal = 0 : i32} vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects, ordinal = 1 : i32} vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {ordinal = 2 : i32} vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects, ordinal = 3 : i32} vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects, ordinal = 4 : i32} vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 3 : i32, ordinal = 5 : i32} vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) attributes {ordinal = 6 : i32} vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {ordinal = 7 : i32} vm.import private @hal.command_buffer.push_constants(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %offset : i32, %values : i32 ...) attributes {ordinal = 8 : i32} vm.import private @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref, %pipeline_layout : !vm.ref, %set : i32, %bindings : tuple, i64, i64> ...) attributes {ordinal = 9 : i32} vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64) attributes {ordinal = 10 : i32} vm.import private @hal.descriptor_set_layout.create(%device : !vm.ref, %flags : i32, %bindings : tuple ...) 
-> !vm.ref attributes {nosideeffects, ordinal = 11 : i32} vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects, ordinal = 12 : i32} vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects, ordinal = 13 : i32} vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i32, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref attributes {ordinal = 14 : i32} vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffers : !vm.ref ...) attributes {ordinal = 15 : i32} vm.import private @hal.devices.count() -> i32 attributes {nosideeffects, ordinal = 16 : i32} vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects, ordinal = 17 : i32} vm.import private @hal.executable.create(%device : !vm.ref, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %pipeline_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects, ordinal = 18 : i32} vm.import private @hal.fence.create(%device : !vm.ref, %flags : i32) -> !vm.ref attributes {ordinal = 19 : i32} vm.import private @hal.fence.await(%timeout_millis : i32, %fences : !vm.ref ...) -> i32 attributes {ordinal = 20 : i32, vm.yield} vm.import private @hal.pipeline_layout.create(%device : !vm.ref, %push_constants : i32, %set_layouts : !vm.ref ...) -> !vm.ref attributes {nosideeffects, ordinal = 21 : i32} vm.rodata private @_utf8_input0_DA9A70D360954439 {alignment = 1 : i64, ordinal = 5 : i32} "input0" vm.rodata private @_utf8_tensor_41A152EEDB094D7A {alignment = 1 : i64, ordinal = 6 : i32} "tensor" vm.rodata private @_utf8_input1_FDCC539DA203DDD3 {alignment = 1 : i64, ordinal = 7 : i32} "input1" vm.func private @main(%arg0: !vm.ref, %arg1: !vm.ref) -> !vm.ref attributes {iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}, ordinal = 0 : i32} { %c13 = vm.const.i32 13 %c28 = vm.const.i32 28 %c4 = vm.const.i32 4 %c7 = vm.const.i32 7 %c32 = vm.const.i32 32 %c48 = vm.const.i32 48 %c3075 = vm.const.i32 3075 %c16 = vm.const.i32 16 %c553648160 = vm.const.i32 553648160 %c3 = vm.const.i32 3 %c2 = vm.const.i32 2 %c1 = vm.const.i32 1 %zero = vm.const.i32.zero %c9 = vm.const.i64 9 %c-1 = vm.const.i32 -1 %c7_0 = vm.const.i64 7 %c2_1 = vm.const.i64 2 %zero_2 = vm.const.i64.zero %null = vm.const.ref.zero : !vm.ref %c-1_3 = vm.const.i64 -1 %c2016 = vm.const.i64 2016 %c2400 = vm.const.i64 2400 %c5 = vm.const.i64 5 %c4_4 = vm.const.i64 4 %c6 = vm.const.i64 6 %__device_0 = vm.global.load.ref @__device_0 : !vm.ref %__device_0_pipeline_layout_0 = vm.global.load.ref @__device_0_pipeline_layout_0 : !vm.ref %__device_0_executable_0_main_dispatch_0 = vm.global.load.ref @__device_0_executable_0_main_dispatch_0 : !vm.ref %0 = vm.call @hal.buffer_view.dim(%arg0, %zero) {nosideeffects} : (!vm.ref, i32) -> i64 %1 = vm.call @hal.buffer_view.dim(%arg0, %c1) {nosideeffects} : (!vm.ref, i32) -> i64 %2 = vm.call @hal.buffer_view.dim(%arg0, %c2) {nosideeffects} : (!vm.ref, i32) -> i64 %3 = vm.call @hal.buffer_view.dim(%arg0, %c3) {nosideeffects} : (!vm.ref, i32) -> i64 %_utf8_input0_DA9A70D360954439 = vm.const.ref.rodata @_utf8_input0_DA9A70D360954439 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg0, 
%_utf8_input0_DA9A70D360954439, %c553648160, %c1, [%0, %1, %2, %3]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %4 = vm.mul.i64 %0, %c4_4 : i64 %5 = vm.mul.i64 %4, %1 : i64 %6 = vm.mul.i64 %5, %2 : i64 %7 = vm.mul.i64 %6, %3 : i64 %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref %ref_5 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref %_utf8_tensor_41A152EEDB094D7A = vm.const.ref.rodata @_utf8_tensor_41A152EEDB094D7A : !vm.buffer vm.call @hal.buffer.assert(%ref, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %7, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %_utf8_input1_FDCC539DA203DDD3 = vm.const.ref.rodata @_utf8_input1_FDCC539DA203DDD3 : !vm.buffer vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_FDCC539DA203DDD3, %c553648160, %c1, [%c4_4, %c6, %c5, %c5]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_41A152EEDB094D7A, %ref_5, %c2400, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () %ref_7 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref %ref_8 = vm.call @hal.device.queue.alloca(%__device_0, %c-1_3, %null, %ref_7, %zero, %c48, %c3075, %c2016) : (!vm.ref, i64, !vm.ref, !vm.ref, i32, i32, i32, i64) -> !vm.ref %8 = vm.trunc.i64.i32 %0 : i64 -> i32 %9 = vm.shr.i64.u %0, %c32 : i64 %10 = vm.trunc.i64.i32 %9 : i64 -> i32 %11 = vm.trunc.i64.i32 %1 : i64 -> i32 %12 = vm.shr.i64.u %1, %c32 : i64 %13 = vm.trunc.i64.i32 %12 : i64 -> i32 %14 = vm.trunc.i64.i32 %2 : i64 -> i32 %15 = vm.shr.i64.u %2, %c32 : i64 %16 = vm.trunc.i64.i32 %15 : i64 -> i32 %17 = vm.trunc.i64.i32 %3 : i64 -> i32 %18 = vm.shr.i64.u %3, %c32 : i64 %19 = vm.trunc.i64.i32 %18 : i64 -> i32 %ref_9 = vm.call @hal.command_buffer.create(%__device_0, %c1, %c3, %c-1_3, %zero) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref vm.call.variadic @hal.command_buffer.push_constants(%ref_9, %__device_0_pipeline_layout_0, %zero, [%8, %10, %11, %13, %14, %16, %17, %19]) : (!vm.ref, !vm.ref, i32, i32 ...) vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %__device_0_pipeline_layout_0, %zero, [(%zero, %zero, %ref, %zero_2, %7), (%c1, %zero, %ref_6, %zero_2, %c2400), (%c2, %zero, %ref_8, %zero_2, %c2016)]) : (!vm.ref, !vm.ref, i32, tuple, i64, i64> ...) vm.call @hal.command_buffer.dispatch(%ref_9, %__device_0_executable_0_main_dispatch_0, %zero, %c7, %c4, %c1, %zero_2) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64) -> () vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero) : (!vm.ref, i32, i32, i32) -> () vm.call @hal.command_buffer.finalize(%ref_9) : (!vm.ref) -> () %ref_10 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i32) -> !vm.ref vm.call.variadic @hal.device.queue.execute(%__device_0, %c-1_3, %ref_7, %ref_10, [%ref_9]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref ...) %20 = vm.call.variadic @hal.fence.await(%c-1, [%ref_10]) : (i32, !vm.ref ...) -> i32 vm.cond_br %20, ^bb2, ^bb1 ^bb1: // pred: ^bb0 %ref_11 = vm.call.variadic @hal.buffer_view.create(%ref_8, %zero_2, %c2016, %c553648160, %c1, [%c2_1, %c4_4, %c7_0, %c9]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref vm.return %ref_11 : !vm.ref ^bb2: // pred: ^bb0 vm.fail %20, "failed to wait on timepoint" } vm.export @main attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @main(%input0: tensor, %input1: tensor<4x6x5x5xf32>) -> (%output0: tensor<2x4x7x9xf32>)"}, ordinal = 0 : i32} vm.export @__init attributes {ordinal = 1 : i32} vm.func private @__init() attributes {ordinal = 1 : i32} { %null = vm.const.ref.zero : !vm.buffer %c8 = vm.const.i32 8 %c2 = vm.const.i32 2 %c3 = vm.const.i32 3 %c7 = vm.const.i32 7 %c1 = vm.const.i32 1 %c14 = vm.const.i32 14 %c-1 = vm.const.i64 -1 %c5 = vm.const.i32 5 %zero = vm.const.i32.zero %zero_0 = vm.const.i64.zero %c1_1 = vm.const.i64 1 %null_2 = vm.const.ref.zero : !vm.ref %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 %1 = vm.ext.i32.i64.s %0 : i32 -> i64 vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 %rnz = vm.cmp.nz.ref %4 : !vm.ref %5 = vm.xor.i32 %rnz, %c1 : i32 %slt = vm.cmp.lt.i64.s %2, %1 : i64 %6 = vm.and.i32 %5, %slt : i32 vm.cond_br %6, ^bb2, ^bb5 ^bb2: // pred: ^bb1 %7 = vm.trunc.i64.i32 %2 : i64 -> i32 %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref %_utf8_hal_device_id_D0F1B3E9D63E707C = vm.const.ref.rodata @_utf8_hal_device_id_D0F1B3E9D63E707C : !vm.buffer %_utf8_local_8DC315A014BAFA34 = vm.const.ref.rodata @_utf8_local_8DC315A014BAFA34 : !vm.buffer %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_D0F1B3E9D63E707C, %_utf8_local_8DC315A014BAFA34) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz = vm.cmp.nz.i64 %8#1 : i64 %9 = vm.select.i32 %8#0, %nz, %zero : i32 vm.cond_br %9, ^bb3, ^bb4(%zero : i32) ^bb3: // pred: ^bb2 %_utf8_hal_executable_format_1F9665C75F0004D3 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_1F9665C75F0004D3, %_utf8_embedded_elf_x86_64_11EF7D6636570B50) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) %nz_3 = vm.cmp.nz.i64 %10#1 : i64 %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 vm.br ^bb4(%11 : i32) ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 %14 = vm.add.i64 %3, %13 : i64 %15 = vm.and.i32 %12, %eq : i32 %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref %16 = vm.add.i64 %2, %c1_1 : i64 vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) ^bb5: // pred: ^bb1 vm.cond_br %5, ^bb6, ^bb7 ^bb6: // pred: ^bb5 vm.fail %c5, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-x86_64\22, {cpu = \22generic\22, cpu_features = \22\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128\22, native_vector_size = 16 : i64, target_triple = \22x86_64-unknown-unknown-eabi-elf\22}>]>" ^bb7: // pred: ^bb5 %_utf8_hal_executable_format_1F9665C75F0004D3_5 = vm.const.ref.rodata @_utf8_hal_executable_format_1F9665C75F0004D3 : !vm.buffer %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_11EF7D6636570B50 : !vm.buffer %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_1F9665C75F0004D3_5, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6) {nosideeffects} : (!vm.ref, 
!vm.buffer, !vm.buffer) -> (i32, i64) %nz_7 = vm.cmp.nz.i64 %17#1 : i64 %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 %ref_8 = vm.call.variadic @hal.descriptor_set_layout.create(%4, %c1, [(%zero, %c7, %c3), (%c1, %c7, %c3), (%c2, %c7, %c2)]) {nosideeffects} : (!vm.ref, i32, tuple ...) -> !vm.ref %ref_9 = vm.call.variadic @hal.pipeline_layout.create(%4, %c8, [%ref_8]) {nosideeffects} : (!vm.ref, i32, !vm.ref ...) -> !vm.ref %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 %eq_10 = vm.cmp.eq.i64 %19, %zero_0 : i64 vm.global.store.ref %4, @__device_0 : !vm.ref vm.cond_br %eq_10, ^bb8, ^bb9 ^bb8: // pred: ^bb7 %main_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @main_dispatch_0_embedded_elf_x86_64 : !vm.buffer %ref_11 = vm.call.variadic @hal.executable.create(%4, %_utf8_embedded_elf_x86_64_11EF7D6636570B50_6, %main_dispatch_0_embedded_elf_x86_64, %null, [%ref_9]) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref ...) -> !vm.ref vm.global.store.ref %ref_11, @__device_0_executable_0_main_dispatch_0 : !vm.ref vm.global.store.ref %ref_9, @__device_0_pipeline_layout_0 : !vm.ref vm.return ^bb9: // pred: ^bb7 vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `main_dispatch_0`; available formats: [embedded-elf-x86_64]" } }
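// The before/after pair above shows that iree-vm-ordinal-allocation rewrites
// no logic: it only assigns each import, export, internal function, global,
// and rodata a dense per-kind ordinal and records the per-kind totals in the
// module's #vm.ordinal_counts attribute so the bytecode serializer can emit
// fixed-size lookup tables. A minimal sketch of the same transformation on a
// toy module (@m, @blob, and @f are illustrative names, not symbols from this
// dump; the #vm.ordinal_counts payload is elided):
//
//   vm.module public @m {
//     vm.rodata private @blob {alignment = 1 : i64} "abc"
//     vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
//     vm.func private @f() { vm.return }
//     vm.export @f
//   }
//
// becomes
//
//   vm.module public @m attributes {ordinal_counts = #vm.ordinal_counts<...>} {
//     vm.rodata private @blob {alignment = 1 : i64, ordinal = 0 : i32} "abc"
//     vm.import private @hal.devices.count() -> i32
//         attributes {nosideeffects, ordinal = 0 : i32}
//     vm.func private @f() attributes {ordinal = 0 : i32} { vm.return }
//     vm.export @f attributes {ordinal = 0 : i32}
//   }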